
    /;ji-B                        d dl mZ d dlZej        d          dk     r" edej                    ej        d           d dlZd dlZd dl	Z
d dlZd dlmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZ  G d d          Z G d d          Z G d d          Z G d d          Zd"dZ	 d#dZ d Z!d Z"d Z#	 	 	 	 d#dZ$	 	 	 	 	 d$d!Z%dS )%    )print_functionN   zpkuseg does not support python2)file   )ProcessQueue)trainer)	inference)config)FeatureExtractor)Model)download_model)Postagc                       e Zd ZdZd ZdS )TrieNodeu   建立词典的Trie树节点c                 0    || _         d| _        i | _        d S )N )iswordusertagchildren)selfr   s     O/root/voice-cloning/.venv/lib/python3.11/site-packages/spacy_pkuseg/__init__.py__init__zTrieNode.__init__   s        N)__name__
__module____qualname____doc__r    r   r   r   r      s)        &&    r   r   c                   $    e Zd ZdZd Zd Zd ZdS )Preprocesseru3   预处理器，在用户词典中的词强制分割c                 
   |g }|| _         t          |t                    rt          |d          5 }|                                }ddd           n# 1 swxY w Y   t          d          | _        |D ]}|                                                    d          }|d                                         }t          |          dk    r|d                                         nd}| 
                    ||           dS t          d          | _        |D ]m}t          |t                    r|                                }	d}
n)t          |          d	k    sJ t          d
 |          \  }	}
| 
                    |	|
           ndS )u   初始化建立Trie树Nutf-8encodingF	r   r   r      c                 *    |                                  S N)stripxs    r   <lambda>z'Preprocesser.__init__.<locals>.<lambda>:   s    		 r   )	dict_data
isinstancestropen	readlinesr   trier*   splitleninsertmap)r   	dict_fileflineslinefieldswordr   w_twts              r   r   zPreprocesser.__init__$   s   I"i%% 	"i'222 &a& & & & & & & & & & & & & & & DI + +++D11ay((/26{{Q&)//+++BD'****	+ + !DI  " "c3'' 8		AAAs88Q;;;;11377DAqAq!!!!" "s   AAAc                     t          |          }| j        }t          |          D ]7}||         }||j        vrt	          d          |j        |<   |j        |         }8d|_        ||_        dS )u   Trie树中插入单词FTN)r5   r3   ranger   r   r   r   )r   r=   r   lnowics          r   r6   zPreprocesser.insert=   so    IIiq 	" 	"AQA$$"*5//Q,q/CC
r   c                 0   g }g }g }t          |          }d}d}||k     r%| j        }|}	d}
d}d}	 ||	         }||j        vr	|dk    rd}
nT||j        vr|dk    rnD|j        |         }|j        r	|	}|j        }|	dz  }	|	|k    r|dk    rn|	|k    r|dk    r|dz   }	d}
nn|
r||k    rG|                    |||                    |                    d           |                    d           |                    |||	                    |                    d           |                    |           |	}|	}n|dz  }||k     %||k     rG|                    |||                    |                    d           |                    d           |||fS )u   对文本进行预处理r   Fr   Tr   )r5   r3   r   r   r   append)r   txtoutlstiswlsttaglstrC   lastrE   rD   jfoundr   last_word_idxrF   s                 r   solvezPreprocesser.solveI   s   HH!ee)CAEGMFCL((]b-@-@ ECL((]b-@-@l1o: *$%M!kGQ66mr1166mr11%)A E#$  199MM#d1f+...MM%(((MM"%%%c!A#h'''d###g&&&QG !eeH !88MM#d1f+&&&MM%   MM"vv%%r   N)r   r   r   r   r   r6   rR   r   r   r   r!   r!   !   sG        ==" " "2
 
 
0& 0& 0& 0& 0&r   r!   c                   $    e Zd ZdZd Zd Zd ZdS )Postprocesseru   对分词结果后处理c                    ||	d| _         d S d| _         |t                      | _        nut          |d          5 }t	          j        |                                                              d          }d d d            n# 1 swxY w Y   t          |          | _        |t                      | _        d S t                      | _        |D ]}t          |d          5 }t	          j        |                                                              d          }d d d            n# 1 swxY w Y   | j        	                    t          |                     d S )NFTrb
)

do_processsetcommon_wordsr1   pklloadr*   r4   other_wordsupdate)r   common_nameother_namesr9   	all_words
other_names         r   r   zPostprocesser.__init__}   s   ;#6#DOF #D
 k4(( <AHQKK--//55d;;	< < < < < < < < < < < < < < < #ID"uuD"uuD) 8 8
 *d++ @q # 1 1 3 3 9 9$ ? ?I@ @ @ @ @ @ @ @ @ @ @ @ @ @ @ ''I77778 8s#   :BBB:D$$D(	+D(	c                 
    t          t          dd                    D ]}t          |          |z
  }|dk     rd}||dz   k     rd                    ||||z                      }| j        v rd}n>| j        v r3|r't           fd||||z            D                       }nd}|rd}nd}nd}|rCt          |          D ]}	||= |                    ||           |dz  }t          |          |z
  }n|dz  }||dz   k     |S )	Nr'      r   r   r   Tc              3   <   K   | ]}|j         v p|j        v V  d S r)   )rZ   r]   ).0r?   r   s     r   	<genexpr>z-Postprocesser.post_process.<locals>.<genexpr>   sW       (N (N<= +,t/@*@ *7 !T%5 5(N (N (N (N (N (Nr   F)reversedrB   r5   joinrZ   r]   allr6   )
r   sentcheck_seperatedmendrE   merged_wordsdo_seg	seperatedks
   `         r   post_processzPostprocesser.post_process   sy   %1++&& 	 	Ad))A+CQwwAsQw;;!wwtAacE{334#444!FF!T%555& *$' (N (N (N (NAEa!e(N (N (N %N %N		 %*	  &!&!%"F "1XX $ $ GGKK<000FAd))a-CCFA/ sQw;;0 r   c                 B    | j         s|S |                     |d          S )NT)rl   )rX   rs   )r   rk   s     r   __call__zPostprocesser.__call__   s*     	K  t <<<r   N)r   r   r   r   r   rs   ru   r   r   r   rT   rT   {   sH        ""8 8 84  @= = = = =r   rT   c                   "    e Zd ZddZd Zd ZdS )	pkusegspacy_ontonotesdefaultFc                    || _         |t          j        v rpt          j                            t          j        |          t          _        t          t          j	        |         t          j        t          j
        |                    n|t          _        |d}d}n|t          j        dgz   vr|}nd}|t          j        v rt          j                            t          j        ||dz             }t          j                            t          j                            t          j                            t                              dd          }||g}nct          j                            t          j                            t          j                            t                              dd          }|g}t          |          | _        t#          d|          | _        t'          j                    | _        t-          j                    | _        d | j        j                                        D             | _        t7          | j        j                  | _        t7          | j        j                  | _        |r{t          t          j	        d         t          j        t          j
        d                    t          j                            t          j        d          }t?          |          | _         dS dS )u-   初始化函数，加载模型及用户词典Nry   z	_dict.pkldictszdefault.pklc                     i | ]\  }}||	S r   r   )rf   tagidxs      r   
<dictcomp>z#pkuseg.__init__.<locals>.<dictcomp>   s+     
 
 
!cC
 
 
r   postag)!r   r   available_modelsospathri   pkuseg_homemodelDirr   
model_urls
model_hashmodels_with_dictdirnamerealpath__file__r!   preprocesserrT   postprocesserr   r\   feature_extractorr   model
tag_to_idxitems
idx_to_tagr5   feature_to_idx	n_featuren_tagr   tagger)	r   
model_name	user_dictr   	file_namer`   rb   default_name
postag_dirs	            r   r   zpkuseg.__init__   sj   
 000 gll" FO 6,Z8&:LfN_`jNkllll(FOIKK 79+ EEE%		 	V444W\\&{* 

  "w||GOOBG$4$4X$>$>??]     *<8!w||GOOBG$4$4X$>$>??]     ,n(33*4==!1!6!8!8Z\\

 
%)%;%F%L%L%N%N
 
 
 T3BCC/:;;
 	-6,X68JFL]^fLghhh" J !,,DKKK	- 	-r   c                     t          | j                            |                    }t          |          }g }t	          |          D ]2}| j                            ||          }|                    |           3t          j        || j	                  \  }}g }	d}
d}t          ||          D ]8\  }}|r|}
d}d| j        |         v r|	                    |
           |}
3|
|z  }
9|
r|	                    |
           |	S )u'   
        直接对文本分词
        NTFB)listr   normalize_textr5   rB   get_node_features_idxrI   _infdecodeViterbi_fastr   zipr   )r   textexampleslengthall_featurer~   node_feature_idx_tagswordscurrent_wordis_startr}   chars                 r   _cutzpkuseg._cut   s+   
 .==dCCDDX== 	1 	1C#5KKX    /0000)+tzBB4T4 	% 	%IC %# ,,,\***#$ 	'LL&&&r   c                    |                                 }g }g }|s|S |                                }|D ]}|s| j                            |          \  }}}t	          |||          D ]\  }	}
}|
r+|                    |	           |                    |           3|                     |	          }|                     |          }|                    |           |                    dgt          |          z             | j
        re| j                            |                                          }t          |          D ]\  }}|r|||<   t          t	          ||                    }|S )u   分词，结果返回一个listr   )r*   r4   r   rR   r   rI   r   r   extendr5   r   r   r}   copy	enumerater   )r   rJ   retusertagsimaryw0lstr   rM   r?   iswr   outputpost_outputr   rE   s                   r   cutz
pkuseg.cut,  s    iikk 	J		  	7 	7B  #'"3"9"9""="=C#&sFF#;#; 	7 	73 JJqMMMOOG,,,1"0088

;'''S%5%5 56666	7 ; 	';??388::..D'11 & &
7 &%DGs3~~&&C
r   N)rx   ry   F)r   r   r   r   r   r   r   r   r   rw   rw      sH        =- =- =- =-B, , ,\& & & & &r   rw      c                 t   t          j                     }t          j                            |           st	          d          t          j                            |          st	          d          t          j                            t
          j                  st          j        t
          j                   t          j                            t
          j        dz             s!t          j        t
          j        dz              | t
          _	        |t
          _
        |t
          _        dt
          _        |t
          _        |t
          _        t          j        t
          j        d           t          j        t
                     t#          dt%          t          j                     |z
            z              dS )	u   用于训练模型ztrainfile does not exist.ztestfile does not exist.z/outputr   T)exist_okzTotal time: N)timer   r   exists	Exceptionr   tempFilemakedirsmkdir	trainFiletestFiler   nThreadttlIter
init_modelr	   trainprintr0   )r   r   savedir
train_iterr   	starttimes         r   r   r   U  s8    	I7>>)$$ 534447>>(## 423337>>&/** %
FO$$$7>>&/I566 .
9,--- FFOFOFNFN"FK$////M& 
.3ty{{Y677
788888r   ry   Fc                    g }|                     t          j                               t          |||          }|                     t          j                               t          j                            |           s"t          d                    |                     t          | dd          5 }|	                                }	d d d            n# 1 swxY w Y   |                     t          j                               g }
|	D ]}|s<|
                     d
                    |                    |                               @|
                     d
                    t          d |                    |                                         |                     t          j                               t          |dd          5 }|                    d	
                    |
                     d d d            n# 1 swxY w Y   |                     t          j                               t          d
                    |d         |d         z
                       |reg d}t          |d t          |d d         |dd                    D                       D ]*\  }}t          d                    ||                     )d S d S )Nr   input_file {} does not exist.rr#   r$    c                 ,    d                     |           S N/ri   r+   s    r   r-   z#_test_single_proc.<locals>.<lambda>  s    ! r   r?   rW   total_time:	{:.3f}rH   r   )
load_model	read_fileword_seg
write_filec                     g | ]
\  }}||z
  S r   r   rf   startrn   s      r   
<listcomp>z%_test_single_proc.<locals>.<listcomp>       FFFZUCS5[FFFr   r   
{}:	{:.3f})rI   r   rw   r   r   r   r   formatr1   r2   ri   r   r7   writer   r   )
input_fileoutput_filer   r   r   verbosetimessegr9   r:   resultsr;   	time_strskeyvalues                  r   _test_single_procr   s  s    E	LL
Yv
6
6
6C	LL7>>*%% L7>>zJJKKK	j#	0	0	0 A               
LLG O O 	ONN388CGGDMM223333NN388C(<(<cggdmm$L$LMMNNNN	LL	k3	1	1	1 $Q			'""###$ $ $ $ $ $ $ $ $ $ $ $ $ $ $	LL	

&
&uRy58';
<
<=== 4III	FF3uSbSz59+E+EFFF
 
 	4 	4JC -&&sE2233334 4	4 	4s$   4CCC)HH	Hc                     t          ||          D ][}||                                         }|                     |          }|                    |d                    |          f           \d S )Nr   )rB   r*   r   putri   )r   r:   r   rn   qrE   rC   r   s           r   _proc_deprecatedr     sk    5# " "!HNNggajj	q#((3-- !!!!" "r   c                 <   	 |                                 }|d S |\  }}| j        s)d                    |                     |                    }n7d                    t	          d |                     |                              }|                    ||f           )NTr   c                 ,    d                     |           S r   r   r+   s    r   r-   z_proc.<locals>.<lambda>      sxx{{ r   )getr   ri   r   r7   r   )r   in_queue	out_queueitemr~   r;   
output_strs          r   _procr     s    	)||~~<F	Tz 	L#''$--00JJ#&:&:CGGDMM"J"JKKJsJ'(((	)r   c                 V   t          | ||          }	 |                                }|d S |\  }}|s)d                    |                    |                    }	n7d                    t	          d |                    |                              }	|                    ||	f           )Nr   Tr   c                 ,    d                     |           S r   r   r+   s    r   r-   z_proc_alt.<locals>.<lambda>  r   r   )rw   r   ri   r   r7   r   )
r   r   r   r   r   r   r   r~   r;   r   s
             r   	_proc_altr     s    
Yv
6
6
6C	)||~~<F	T 	L#''$--00JJ#&:&:CGGDMM"J"JKKJsJ'(((	)r   c           
         t          j                    dk    }g }|                    t          j                               |rd }	nt	          |||          }	|                    t          j                               t
          j                            |           s"t          d	                    |                     t          | dd          5 }
|
                                }d d d            n# 1 swxY w Y   |                    t          j                               t                      }t                      }g }t          |          D ]N}|rt          t          |||||f          }nt          t           |	||f          }|                    |           Ot#          |          D ]\  }}|                    ||f           |D ]+}|                    d            |                                 ,|                    t          j                               d gt)          |          z  }|D ]}|                                \  }}|||<   |                    t          j                               |D ]}|                                 |                    t          j                               t          |dd          5 }
|
                    d                    |                     d d d            n# 1 swxY w Y   |                    t          j                               t1          d		                    |d
         |d         z
                       |rg d}|r|dd          }|dd          }d|d<   t3          |d t3          |d d
         |dd                    D                       D ]*\  }}t1          d	                    ||                     )d S d S )Nspawnr   r   r#   r$   )targetargsr?   rW   r   rH   r   )r   r   
start_procr   	join_procr   r   zload_modal & word_segr'   c                     g | ]
\  }}||z
  S r   r   r   s      r   r   z$_test_multi_proc.<locals>.<listcomp>  r   r   r   )multiprocessingget_start_methodrI   r   rw   r   r   r   r   r   r1   r2   r   rB   r   r   r   r   r   r   r5   r   ri   r   r   r   )r   r   nthreadr   r   r   r   altr   r   r9   r:   r   r   procsr   pr~   r;   procresultr   r   r   s                           r   _test_multi_procr    sL    
*
,
,
7CE	LL
 4ZF33	LL7>>*%% L7>>zJJKKK	j#	0	0	0 A               
LLwwHIE7^^   	G  )VXyI  AA
 uC9+EFFFAQu%% " "	Tc4[!!!!  T

	LLVc%jj F  MMOO	Ts	LL  		LL	k3	1	1	1 #Q			&!!"""# # # # # # # # # # # # # # #	LL	

&
&uRy58';
<
<=== 4
 
 
	  	3!""IE!!""I2IaLFF3uSbSz59+E+EFFF
 
 	4 	4JC -&&sE223333'4 4	4 	4s$   C00C47C4)K==LL
   c           	      h    |dk    rt          | ||||||           d S t          | |||||           d S )Nr   )r  r   )r   r   r   r   r	  r   r   s          r   testr    sc     {{Wj)VW	
 	
 	
 	
 	
 	ZFG	
 	
 	
 	
 	
r   )r   N)ry   ry   FF)ry   ry   r  FF)&
__future__r   sysversion_infor   stderrexitr   r   pickler[   r  r   r   r   r	   r
   r   r   r   r   r   r   downloadr   r   r   r   r!   rT   rw   r   r   r   r   r   r  r  r   r   r   <module>r     s   % % % % % % 



A	E
+#*====CHQKKK 				          * * * * * * * *                   / / / / / /       $ $ $ $ $ $             X& X& X& X& X& X& X& X&t?= ?= ?= ?= ?= ?= ?= ?=BV V V V V V V Vr9 9 9 9> _d#4 #4 #4 #4L" " ") ) ) ) ) )$ S4 S4 S4 S4r 
 
 
 
 
 
r   