
    /h~0                     :   S r SSKrSSKrSSKJr  SSKJrJr   SSK	J
r
  SSKJr  SSKJr  SSKJr  SS	KJr  SS
KJr   " S S\5      r " S S\5      rS rS rS rSS jrS rS r " S S\5      rSS jr \ S:X  a  \" S5        \" S5        gg! \ a     Nyf = f)z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScorefind)word_tokenize)Treec                   4    \ rS rSrSrS	S jrS rS rS rSr	g)
NEChunkParserTagger   z*
The IOB tagger used by the chunk parser.
Nc                 F    [         R                  " U UU R                  US9  g )N)trainclassifier_builder
classifier)r   __init___classifier_builder)selfr   r   s      O/var/www/auris/envauris/lib/python3.13/site-packages/nltk/chunk/named_entity.pyr   NEChunkParserTagger.__init__$   s"    &&#77!		
    c                 0    [         R                  " USSSS9$ )Niis      )	algorithmgaussian_prior_sigmatrace)r   r   r   r   s     r   r   'NEChunkParserTagger._classifier_builder,   s!    %%!"
 	
r   c                      U R                   nU$ ! [         a5    SSKJn  [	        UR                  S5      5      U l         U R                   n U$ f = f)Nr   )wordszen-basic)_en_wordlistAttributeErrornltk.corpusr$   set)r   wlr$   s      r   _english_wordlist%NEChunkParserTagger._english_wordlist5   sS    	#""B 	  	#) #EKK
$; <D""B		#s    ;AAc                 d   X   S   n[        X   S   5      nUS:X  a  S =pgS =pS =n
=pOUS:X  a=  XS-
     S   R                  5       nS n[        XS-
     S   5      nS n	X2S-
     S   nS =pOoXS-
     S   R                  5       nXS-
     S   R                  5       n[        XS-
     S   5      n[        XS-
     S   5      n	X2S-
     nX2S-
     n[        U5      n
U[        U5      S-
  :X  a  S =pS =nnOU[        U5      S-
  :X  a5  XS-      S   R                  5       nXS-      S   R                  5       nS nS nO`XS-      S   R                  5       nXS-      S   R                  5       nXS-      S   R                  5       nXS-      S   R                  5       n0 SS_S[        U5      _S[        U5      _SUS S	 R                  5       _S
USS  R                  5       _SU_SU_SX@R	                  5       ;   _SU_SU_SU_SU_SU_SUR                  5        SU 3_SU SU 3_SU
 SU 3_nU$ )Nr   r   r   biasTshapewordlenprefix3   suffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr.   lenr*   )r   tokensindexhistoryr5   r4   r9   prevprevwordr7   prevprevpos	prevshaper6   prevprevtagr:   nextnextwordr8   nextnextposfeaturess                     r   _feature_detector%NEChunkParserTagger._feature_detector?   s   }Q6=+,A:&**H$((G044I4+aZai(+113HL"6!)#4Q#78GKai(+G&**Iai(+113H!!),Q/557L"6!)#4Q#78G&vai'8';<Kai(G!!),KhICK!O#&**H$((Gkc&kAo%ai(+113HQY'*002GLKai(+113HQY'*002G!!),Q/557L +A.446K
D
U4[
 s4y
 tBQx~~'	

 tBCy(
 3
 D
 D$:$:$<<
 w
 w
 w
 
 
 tzz|nAgY7
 cU!G9-
  	{!G95!
& r   )r%   )NN)
__name__
__module____qualname____firstlineno____doc__r   r   r*   rI   __static_attributes__ r   r   r   r      s    

8r   r   c                   @    \ rS rSrSrS rS rS rS r\	S 5       r
Srg	)
NEChunkParserz   *
Expected input: list of pos-tagged words
c                 &    U R                  U5        g N)_trainr!   s     r   r   NEChunkParser.__init__   s    Er   c                 ^    U R                   R                  U5      nU R                  U5      nU$ )z(
Each token should be a pos-tagged word
)_taggertag_tagged_to_parse)r   r?   taggedtrees       r   parseNEChunkParser.parse   s-     !!&)$$V,r   c                 j    U Vs/ s H  o R                  U5      PM     nn[        US9U l        g s  snf )N)r   )_parse_to_taggedr   r[   )r   corpusss      r   rX   NEChunkParser._train   s0    4:;Fq''*F;*8 <s   0c                    [        S/ 5      nU H  u  p4US:X  a  UR                  U5        M  UR                  S5      (       a!  UR                  [        USS U/5      5        MU  UR                  S5      (       d  Mm  U(       aH  [        US   [         5      (       a0  US   R	                  5       USS :X  a  US   R                  U5        M  UR                  [        USS U/5      5        M     U$ )z8
Convert a list of tagged tokens to a chunk-parse tree.
SOB-r   NI-)r   append
startswith
isinstancelabel)r   tagged_tokenssenttokr\   s        r   r]   NEChunkParser._tagged_to_parse   s     C}%HCczC %%DQR3%01%%JtBx6648>>;KsSTSUw;VHOOC(KKSWse 45 & r   c                 V   / nU  H  n[        U[        5      (       au  [        U5      S:X  a  [        S5        M4  UR	                  US   SUR                  5        345        USS  H'  nUR	                  USUR                  5        345        M)     M  UR	                  US45        M     U$ )z8
Convert a chunk-parse tree to a list of tagged tokens.
r   z"Warning -- empty chunk in sentencerj   r   Nrk   ri   )ro   r   r>   printrm   rp   )rr   tokschildrs   s       r   rc   NEChunkParser._parse_to_tagged   s    
 E%&&u:?>?U1XEKKM?';<= 9CKK5;;=/&: ;< % UCL)  r   )r[   N)rK   rL   rM   rN   rO   r   r`   rX   r]   staticmethodrc   rP   rQ   r   r   rS   rS   z   s/    9$  r   rS   c                 d   [         R                  " SU [         R                  5      (       a  g[         R                  " SU [         R                  5      (       a  g[         R                  " SU [         R                  5      (       a-  U R                  5       (       a  gU R	                  5       (       a  ggg	)
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$upcasedowncase	mixedcaseother)rematchUNICODEistitleislower)r5   s    r   r.   r.      sp    	xx4dBJJGG	'4	,	,	'4	,	,<<>>\\^^r   c                 X    U R                  S5      (       a  gU R                  S5      S   $ )NV-r   )rn   split)re   s    r   r<   r<      s&    ||Cwws|Ar   c                 |   U R                  5       nS [        U5       5       n[        S/ 5      nU  H  n[        U[        5      (       aS  UR	                  [        UR                  5       / 5      5        U H"  nUS   R	                  U[        U5      45        M$     Mk  UR	                  U[        U5      45        M     U$ )Nc              3   *   #    U  H	  u  pUv   M     g 7frW   rQ   ).0r5   r4   s      r   	<genexpr>postag_tree.<locals>.<genexpr>   s     6~~s   rh   rl   )leavesr   r   ro   rm   rp   next)r_   r$   tag_iternewtreerx   subchilds         r   postag_treer      s    KKME6wu~6H3mGeT""NN4r23!""Hd8n#=> " NNE4>23  Nr   binaryc           	   #   D  #    U  H  n[         R                  " U5       Hw  u  p4nUR                  S5      (       a	  U(       a  M%  U HL  nUR                  S5      (       d  M  [        [         R                  R                  X65      U5       S h  vN   MN     My     M     g  N7f)Nbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmt
skip_bnewsrootdirsfilesfs          r   load_ace_datar      sv     !#D}}W%%*::f%%,RWW\\$-BCHHH  "/  Is   AB  ,B B
B c           	   #     #    [        S[        R                  R                  U 5      S    35        U S-   n/ n[	        U5       n[
        R                  " U5      R                  5       nS S S 5        WR                  S5       H  nUR                  S5      R                  nUR                  S5       Hx  nUR                  S5      S:w  a  M  [        UR                  S	5      R                  5      n	[        UR                  S
5      R                  5      S-   n
UR                  XU45        Mz     M     [	        U 5       nUR                  5       nS S S 5        [        R                   " SSW5      nS n[        R                   " SX5      n[        R                   " SSU5      n[        R                   " SSU5      n[        R                   " SSU5      nU V	V
Vs1 s H  u  powiM	     nn
n	nUS:X  a  Sn[#        S/ 5      n[%        U5       H]  u  pnX:  a  Un	X::  a  M  UR'                  [)        XU	 5      5        UR                  [#        SXU
 R                  5       5      5        U
nM_     UR'                  [)        XS  5      5        Uv   g US:X  a  Sn[#        S/ 5      n[%        U5       H\  u  pnX:  a  Un	X::  a  M  UR'                  [)        XU	 5      5        UR                  [#        X{X R                  5       5      5        U
nM^     UR'                  [)        XS  5      5        Uv   g [+        S5      e! , (       d  f       GN= f! , (       d  f       GN= fs  snn
n	f 7f)Nz  - r   z.tmx.rdc.xmlzdocument/entityentity_typeentity_mentionTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 P    SU R                  5       U R                  5       -
  S-
  -  $ )N    )endstart)ms    r   subfuncload_ace_file.<locals>.subfunc   s#    aeeg	)A-..r   z[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" r   r   rh   NE
multiclasszbad fmt value)rv   r   r   r   openETr`   getrootfindallr
   textgetintrm   readr   subr   sortedextendr   
ValueError)textfiler   annfileentitiesinfilexmlentitytypmentionre   er   r   entity_typesirw   s                   r   r   r      s    	Dx(+,
-.'G H	g&hhv&&( 
++/0kk-(--~~&67G{{6"f,GLL!56;;<AGLL!3499:Q>AOOQ3K( 8 1 
h6{{} 
 66%r40D/ 66"G2D66#R.D 66$d#D66$d#D+348KQ3C8L4 hC})IA#uvKKdQi01KKT4!9??#456A * 	M$r(+,
 
	C})IA#uvKKdQi01KKSq)//"345A * 	M$r(+,
 ))} 
 
" 5sE   AM%L2(CM;MBMM"EM2
M<M
MMc           	      Z   [         R                  U 5      n [         R                  U5      nSn[        X5       Hp  u  u  p4u  p5XEs=:X  a  S:X  aB  O  O?U(       d6  [        SUS SUS SU 35        [        SR	                  SSS5      5        SnMV  MX  Sn[        SUS SUS SU 35        Mr     g )	NFri   z  15r   z  {:15} {:15} {2}z...T)rS   rc   ziprv   format)correctguessedellipsiswctgts         r   
cmp_chunksr   .  s    ,,W5G,,W5GH1!?s?2b'2b'1#./)00uEF 
 HBr"gQr"gQqc*+ 2r   c                   .    \ rS rSrSrSS jrS rS rSrg)	Maxent_NE_Chunkeri@  rU   c                 ^    SSK Jn  Xl        U" SU S35      U l        U R	                  5         g )Nr   r	   z+chunkers/maxent_ne_chunker_tab/english_ace_/)	nltk.datar
   _fmt_tab_dirload_params)r   r   r
   s      r   r   Maxent_NE_Chunker.__init__E  s,    "	J3%qQRr   c                 |    SSK JnJn  U" U R                  5      u  p4pV[	        U" XTUS9U5      n[        US9U l        g )Nr   )BinaryMaxentFeatureEncodingload_maxent_params)alwayson_features)r   )nltk.classify.maxentr   r   r   r   r   r[   )r   r   r   wgtmpglabaonmcs           r   r   Maxent_NE_Chunker.load_paramsL  s<    X/>#'CH#
 +b9r   c           	          SSK Jn  U R                  R                  nUR                  nUR
                  nUR                  nUR                  nUR                  nU R                  nU" XEXgSU S3S9  g )Nr   )save_maxent_paramsz/tmp/english_ace_r   )tab_dir)
r   r   r[   _classifier	_encoding_weights_mapping_labels	_alwaysonr   )	r   r   classifecgr   r   r   r   r   s	            r   save_paramsMaxent_NE_Chunker.save_paramsU  sd    ;,,**llkkmmii3S9J3%q7QRr   )r   r   r[   Nr   )	rK   rL   rM   rN   rO   r   r   r   rP   rQ   r   r   r   r   @  s    :
Sr   r   r   c                 <    [        U 5      nUR                  5         U$ rW   )r   r   )r   chunkers     r   build_modelr   b  s    $GNr   __main__)r   Tr   )!rO   r   r   	xml.etreer   r   nltk.tagr   r   nltk.classifyr   ImportErrornltk.chunk.apir   nltk.chunk.utilr   r   r
   nltk.tokenizer   	nltk.treer   r   rS   r.   r<   r   r   r   r   r   r   rK   rQ   r   r   <module>r     s    
 	 ' 3	. ( &  ' X/ Xv8L 8v ID*R,$S SD(T z   		s   B BB