o
    rZh~0                     @   s  d Z ddlZddlZddlmZ ddlmZmZ zddl	m
Z
 W n	 ey)   Y nw ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ G dd deZG dd deZdd Zdd Zdd Zd#ddZdd Zdd ZG dd deZd$d d!Z	 e d"kred ed dS dS )%z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScorefind)word_tokenize)Treec                   @   s2   e Zd ZdZdddZdd Zdd Zd	d
 ZdS )NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    Nc                 C   s   t j| || j|d d S )N)trainZclassifier_builder
classifier)r   __init___classifier_builder)selfr   r    r   F/var/www/auris/lib/python3.10/site-packages/nltk/chunk/named_entity.pyr   $   s   
zNEChunkParserTagger.__init__c                 C   s   t j|ddddS )NZiis      )	algorithmZgaussian_prior_sigmatrace)r   r   r   r   r   r   r   r   ,   s   z'NEChunkParserTagger._classifier_builderc                 C   sD   z| j }W |S  ty!   ddlm} t|d| _ | j }Y |S w )Nr   )wordszen-basic)Z_en_wordlistAttributeErrorZnltk.corpusr   set)r   Zwlr   r   r   r   _english_wordlist5   s   z%NEChunkParserTagger._english_wordlistc                 C   sj  || d }t || d }|dkr!d  }}d  }}	d  }
 }}na|dkrJ||d  d  }d }t ||d  d }d }	||d  d }d  }
}n8||d  d  }||d  d  }t ||d  d }t ||d  d }	||d  }||d  }t|}
|t|d krd  }}d  }}nI|t|d kr||d  d  }||d  d  }d }d }n(||d  d  }||d  d  }||d  d  }||d  d  }i dddt|dt|d|d d	  d
|dd   d|d|d||  v d|d|d|d|d|d|  d| d| d| d|
 d| }|S )Nr   r   r   ZbiasTshapeZwordlenZprefix3   Zsuffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr   lenr   )r   tokensindexhistoryr!   r    r%   Zprevprevwordr#   ZprevprevposZ	prevshaper"   Zprevprevtagr&   Znextnextwordr$   Znextnextposfeaturesr   r   r   _feature_detector?   s   

	
z%NEChunkParserTagger._feature_detector)NN)__name__
__module____qualname____doc__r   r   r   r/   r   r   r   r   r      s    
	
r   c                   @   s<   e Zd ZdZdd Zdd Zdd Zdd	 Zed
d Z	dS )NEChunkParser2
    Expected input: list of pos-tagged words
    c                 C   s   |  | d S N)_trainr   r   r   r   r      s   zNEChunkParser.__init__c                 C   s   | j |}| |}|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r+   Ztaggedtreer   r   r   parse   s   
zNEChunkParser.parsec                    s"    fdd|D }t |d _d S )Nc                    s   g | ]}  |qS r   )_parse_to_tagged).0sr   r   r   
<listcomp>       z(NEChunkParser._train.<locals>.<listcomp>)r   )r   r8   )r   Zcorpusr   r@   r   r7      s   zNEChunkParser._trainc                 C   s   t dg }|D ]P\}}|dkr|| q|dr(|t |dd |g q|drW|rJt|d t rJ|d  |dd krJ|d | q|t |dd |g q|S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOB-r   NI-)r   append
startswith
isinstancelabel)r   Ztagged_tokenssenttokr9   r   r   r   r:      s   


*zNEChunkParser._tagged_to_parsec                 C   s   g }| D ]=}t |tr:t|dkrtd q||d d|  f |dd D ]}||d|  f q*q||df q|S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencerE   r   NrF   rD   )rJ   r   r*   printrH   rK   )rL   tokschildrM   r   r   r   r=      s   
zNEChunkParser._parse_to_taggedN)
r0   r1   r2   r3   r   r<   r7   r:   staticmethodr=   r   r   r   r   r4   z   s    r4   c                 C   sX   t d| t jr
dS t d| t jrdS t d| t jr*|  r"dS |  r(dS dS d	S )
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$ZupcaseZdowncaseZ	mixedcaseother)rematchUNICODEistitleislower)r!   r   r   r   r      s   r   c                 C   s   |  drdS | dd S )NV-r   )rI   split)r?   r   r   r   r(      s   
r(   c                 C   s   |   }dd t|D }tdg }| D ]+}t|tr6|t| g  |D ]}|d |t|f q'q||t|f q|S )Nc                 s   s    | ]\}}|V  qd S r6   r   )r>   r!   r    r   r   r   	<genexpr>   s    zpostag_tree.<locals>.<genexpr>rC   rG   )leavesr   r   rJ   rH   rK   next)r;   r   Ztag_iterZnewtreerP   Zsubchildr   r   r   postag_tree   s   

r`   binaryTc                 c   sd    | D ],}t |D ]$\}}}|dr|rq
|D ]}|dr-tt j|||E d H  qq
qd S )NZbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmtZ
skip_bnewsrootdirsfilesfr   r   r   load_ace_data   s   
rn   c                 c   s   t dtj| d   | d }g }t|}t| }W d    n1 s*w   Y  |dD ]2}|	dj
}|dD ]$}|ddkrKqAt|	d	j
}	t|	d
j
d }
||	|
|f qAq4t| }| }W d    n1 szw   Y  tdd|}dd }td||}tdd|}tdd|}tdd|}dd |D }|dkrd}tdg }t|D ]+\}	}
}|	|k r|}	|
|	krq|t|||	  |td||	|
   |
}q|t||d   |V  d S |dkrHd}tdg }t|D ]/\}	}
}|	|k r|}	|
|	krq|t|||	  |t|||	|
   |
}q|t||d   |V  d S td)Nz  - r   z.tmx.rdc.xmlzdocument/entityZentity_typeZentity_mentionZTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 S   s   d|   |   d  S )N    )endstart)mr   r   r   subfunc   s   zload_ace_file.<locals>.subfuncz[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" c                 S   s   h | ]\}}}|qS r   r   )r>   r?   etypr   r   r   	<setcomp>  rB   z load_ace_file.<locals>.<setcomp>ra   r   rC   ZNE
multiclasszbad fmt value)rN   rb   rf   r\   openETr<   Zgetrootfindallr	   textgetintrH   readrU   subr   sortedextendr
   
ValueError)Ztextfileri   Zannfileentitiesinfilexmlentityrx   Zmentionr?   rw   r~   rv   Zentity_typesirO   r   r   r   re      sj   









re   c                 C   s   t | } t |}d}t| |D ]B\\}}\}}||  kr#dkrBn n|sAtd|dd|dd|  tdddd d}qd}td|dd|dd|  qd S )	NFrD   z  Z15rq   z  {:15} {:15} {2}z...T)r4   r=   ziprN   format)ZcorrectZguessedellipsiswctgtr   r   r   
cmp_chunks.  s   

 r   c                   @   s*   e Zd ZdZd
ddZdd Zdd Zd	S )Maxent_NE_Chunkerr5   rz   c                 C   s0   ddl m} || _|d| d| _|   d S )Nr   r   z+chunkers/maxent_ne_chunker_tab/english_ace_/)	nltk.datar	   _fmt_tab_dirload_params)r   ri   r	   r   r   r   r   E  s   zMaxent_NE_Chunker.__init__c                 C   sF   ddl m}m} || j\}}}}t||||d|}t|d| _d S )Nr   )BinaryMaxentFeatureEncodingload_maxent_params)Zalwayson_features)r   )nltk.classify.maxentr   r   r   r   r   r8   )r   r   r   wgtmpglabaonZmcr   r   r   r   L  s   zMaxent_NE_Chunker.load_paramsc           	      C   sV   ddl m} | jj}|j}|j}|j}|j}|j}| j	}|||||d| dd d S )Nr   )save_maxent_paramsz/tmp/english_ace_r   )Ztab_dir)
r   r   r8   Z_classifier	_encodingZ_weights_mappingZ_labelsZ	_alwaysonr   )	r   r   ZclassifZecgr   r   r   r   ri   r   r   r   save_paramsU  s   zMaxent_NE_Chunker.save_paramsNrz   )r0   r1   r2   r3   r   r   r   r   r   r   r   r   @  s
    
	r   rz   c                 C   s   t | }|  |S r6   )r   r   )ri   Zchunkerr   r   r   build_modelb  s   r   __main__)ra   Tr   )!r3   rb   rU   Z	xml.etreer   r|   Znltk.tagr   r   Znltk.classifyr   ImportErrorZnltk.chunk.apir   Znltk.chunk.utilr   r   r	   Znltk.tokenizer
   Z	nltk.treer   r   r4   r   r(   r`   rn   re   r   r   r   r0   r   r   r   r   <module>   s<   [;

I
"*