o
    rZhF:                     @   s8  d dl Z d dlZd dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZmZ d dlmZmZ dd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Z					 		!	 	"	#	"	"						"	d0d$d%Zd&d' Zd1d(d)Zed*d+gZeg d,Z d-d. Z!e"d/kre  dS dS )2    N)treebank)BrillTaggerTrainerRegexpTaggerUnigramTagger)PosWord)Template
error_listc                   C   s
   t   dS )z
    Run a demo with defaults. See source comments for details,
    or docstrings of any of the more specific demo_* functions.
    Npostag r   r   </var/www/auris/lib/python3.10/site-packages/nltk/tbl/demo.pydemo   s   
r   c                   C      t dd dS )N
    Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
    repr
ruleformatNr
   r   r   r   r   demo_repr_rule_format      r   c                   C   r   )r   strr   Nr
   r   r   r   r   demo_str_rule_format$   r   r   c                   C   r   )z*
    Exemplify Rule.format("verbose")
    verboser   Nr
   r   r   r   r   demo_verbose_rule_format+   r   r   c                   C   s   t ttg dgd dS )a  
    The feature/s of a template takes a list of positions
    relative to the current word where the feature should be
    looked for, conceptually joined by logical OR. For instance,
    Pos([-1, 1]), given a value V, will hold whenever V is found
    one step to the left and/or one step to the right.

    For contiguous ranges, a 2-arg form giving inclusive end
    points can also be used: Pos(-3, -1) is the same as the arg
    below.
    )	templatesN)r   r   r   r   r   r   r   demo_multiposition_feature2   s   r   c                   C   s$   t ttdgtddggd dS )z8
    Templates can have more than a single feature.
    r   r   r   r   N)r   r   r   r   r   r   r   r   demo_multifeature_templateA   s   $r    c                   C   s   t ddd dS )ah  
    Show aggregate statistics per template. Little used templates are
    candidates for deletion, much used templates may possibly be refined.

    Deleting unused templates is mostly about saving time and/or space:
    training is basically O(T) in the number of templates T
    (also in terms of memory usage, which often will be the limiting factor).
    T)incremental_statstemplate_statsNr
   r   r   r   r   demo_template_statisticsH   s   	r#   c                  C   sj   t jg dddgdd} tjg dddgdd}ttj| |gdd	}td
t| t|ddd dS )a	  
    Template.expand and Feature.expand are class methods facilitating
    generating large amounts of templates. See their documentation for
    details.

    Note: training with 500 templates can easily fill all available
    even on relatively small corpora
    )r   r      r$      F)Zexcludezero)r   r   r   r$   T)r$      )combinationsz8Generated {} templates for transformation-based learning)r   r!   r"   N)	r   expandr   listr   printformatlenr   )ZwordtplsZtagtplsr   r   r   r   demo_generated_templatesT   s   	r-   c                   C   s   t dddd dS )z
    Plot a learning curve -- the contribution on tagging accuracy of
    the individual rules.
    Note: requires matplotlib
    Tzlearningcurve.png)r!   separate_baseline_datalearning_curve_outputNr
   r   r   r   r   demo_learning_curveh   s
   
r0   c                   C   r   )zW
    Writes a file with context for each erroneous word after tagging testing data
    z
errors.txt)error_outputNr
   r   r   r   r   demo_error_analysisu   r   r2   c                   C   r   )zm
    Serializes the learned tagger to a file in pickle format; reloads it
    and validates the process.
    z
tagger.pcl)serialize_outputNr
   r   r   r   r   demo_serialize_tagger|   s   r4   c                   C   s   t dddd dS )z
    Discard rules with low accuracy. This may hurt performance a bit,
    but will often produce rules which are more interesting read to a human.
    i  gQ?
   )	num_sentsmin_acc	min_scoreNr
   r   r   r   r   demo_high_accuracy_rules   s   r9     ,  r&   皙?Fr   c           &      C   s|  |pt }| du rddlm}m} | } t|||||\}}}}|rptj|sOt||d}t	|d}t
|| W d   n1 sCw   Y  td| t	|}t
|}td|  W d   n1 sjw   Y  n
t||d}td |rtd	|| t }t|| ||	d
}td |||||}tdt | dd |rtd||  |dkrtd t| dD ]\}}t|dd||	d q|
rtd |||\} }!td |std | }"|r||! |rt||!|"|d td|  ntd ||} |r|  |durXt	|d}#|#d|  |#dt|| dd  W d   n	1 sLw   Y  td|  |dur||} t	|d}t
|| W d   n	1 syw   Y  td|  t	|}t
|}$W d   n	1 sw   Y  td|  ||}%| |%krtd  dS td! dS dS )"a
  
    Brill Tagger Demonstration
    :param templates: how many sentences of training and testing data to use
    :type templates: list of Template

    :param tagged_data: maximum number of rule instances to create
    :type tagged_data: C{int}

    :param num_sents: how many sentences of training and testing data to use
    :type num_sents: C{int}

    :param max_rules: maximum number of rule instances to create
    :type max_rules: C{int}

    :param min_score: the minimum score for a rule in order for it to be considered
    :type min_score: C{int}

    :param min_acc: the minimum score for a rule in order for it to be considered
    :type min_acc: C{float}

    :param train: the fraction of the the corpus to be used for training (1=all)
    :type train: C{float}

    :param trace: the level of diagnostic tracing output to produce (0-4)
    :type trace: C{int}

    :param randomize: whether the training data should be a random subset of the corpus
    :type randomize: C{bool}

    :param ruleformat: rule output format, one of "str", "repr", "verbose"
    :type ruleformat: C{str}

    :param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
    :type incremental_stats: C{bool}

    :param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
    :type template_stats: C{bool}

    :param error_output: the file where errors will be saved
    :type error_output: C{string}

    :param serialize_output: the file where the learned tbl tagger will be saved
    :type serialize_output: C{string}

    :param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
    :type learning_curve_output: C{string}

    :param learning_curve_take: how many rules plotted
    :type learning_curve_take: C{int}

    :param baseline_backoff_tagger: the file where rules will be saved
    :type baseline_backoff_tagger: tagger

    :param separate_baseline_data: use a fraction of the training data exclusively for training baseline
    :type separate_baseline_data: C{bool}

    :param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                                  deterministic output from the baseline unigram tagger between python versions)
    :type cache_baseline_tagger: C{string}


    Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
    is fast and fine for a demo, but is likely to generalize worse on unseen data.
    Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
    Nr   )brill24describe_template_sets)backoffwz)Trained baseline tagger, pickled it to {}zReloaded pickled tagger from zTrained baseline taggerz!    Accuracy on test set: {:0.4f}r   zTraining tbl tagger...zTrained tbl tagger in z0.2fz secondsz    Accuracy on test set: %.4fr$   z
Learned rules: Z4d szJIncrementally tagging the test data, collecting individual rule statisticsz    Rule statistics collectedzbWARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially high)takez Wrote plot of learning curve to zTagging the test datazErrors for Brill Tagger %r


zutf-8z)Wrote tagger errors including context to zWrote pickled tagger to z4Reloaded tagger tried on test set, results identicalz;PROBLEM: Reloaded tagger gave different results on test set)REGEXP_TAGGERnltk.tag.brillr=   r>   _demo_prepare_dataospathexistsr   openpickledumpr*   r+   loadZaccuracytimer   train	enumeraterulesZbatch_tag_incrementalZtrain_statsZprint_template_statistics
_demo_plotZ	tag_sentswritejoinr	   encode)&r   tagged_datar6   Z	max_rulesr8   r7   rP   trace	randomizer   r!   r"   r1   r3   r/   Zlearning_curve_takeZbaseline_backoff_taggerr.   Zcache_baseline_taggerr=   r>   training_databaseline_data	gold_datatesting_dataZbaseline_taggerZprint_rulesZtbrillZtrainerZbrill_taggerZrulenoruleZ
taggedtest	teststats
trainstatsfZbrill_tagger_reloadedZtaggedtest_reloadedr   r   r   r      s   X





"




r   c                 C   s0  | d u rt d t } |d u st| |krt| }|r(tt|  t|  t|| }| d | }| || }dd |D }|sF|}	nt|d }
|d |
 ||
d  }	}t|\}}t|\}}t|	\}}t d|dd|dd t d	|dd|dd t d
	|||rdnd ||	||fS )Nz%Loading tagged data from treebank... c                 S   s   g | ]	}d d |D qS )c                 S   s   g | ]}|d  qS )r   r   ).0tr   r   r   
<listcomp>a  s    z1_demo_prepare_data.<locals>.<listcomp>.<listcomp>r   )rb   sentr   r   r   rd   a  s    z&_demo_prepare_data.<locals>.<listcomp>r&   zRead testing data (dz sents/z wds)zRead training data (z-Read baseline data ({:d} sents/{:d} wds) {:s} z[reused the training set])
r*   r   Ztagged_sentsr,   randomseedshuffleintcorpus_sizer+   )rW   rP   r6   rY   r.   cutoffrZ   r\   r]   r[   Z	bl_cutoffZ	trainseqsZtraintokensZtestseqsZ
testtokensZbltrainseqsZbltraintokensr   r   r   rG   Q  s>   



rG   c           	         s    d g} d D ]}| |d |  q	 fdd|d | D }d g}d D ]}| |d |  q+fdd|d | D }dd lm} ttt|}||||| |g d ||  d S )	NZinitialerrorsZ
rulescoresr   c                       g | ]
}d | d   qS r$   Z
tokencountr   rb   x)r_   r   r   rd   }      z_demo_plot.<locals>.<listcomp>c                    rn   ro   r   rp   )r`   r   r   rd     rr   r   )NNNg      ?)	appendZmatplotlib.pyplotZpyplotr)   ranger,   ZplotZaxisZsavefig)	r/   r_   r`   rC   Z	testcurveZ	rulescoreZ
traincurveZpltrr   )r_   r`   r   rS   y  s   

rS   z^-?[0-9]+(\.[0-9]+)?$ZCDz.*NN)	rv   )z(The|the|A|a|An|an)$AT)z.*able$ZJJ)z.*ness$rx   )z.*ly$ZRB)z.*s$ZNNS)z.*ing$ZVBG)z.*ed$ZVBDrw   c                 C   s   t | tdd | D fS )Nc                 s   s    | ]}t |V  qd S )N)r,   rp   r   r   r   	<genexpr>  s    zcorpus_size.<locals>.<genexpr>)r,   sum)Zseqsr   r   r   rl     s   rl   __main__)NNr:   r;   r&   Nr<   r&   Fr   FFNNNr;   NFN)NN)#rH   rL   rh   rO   Znltk.corpusr   Znltk.tagr   r   r   rF   r   r   Znltk.tblr   r	   r   r   r   r   r   r    r#   r-   r0   r2   r4   r9   r   rG   rS   ZNN_CD_TAGGERrE   rl   __name__r   r   r   r   <module>   sf   		
 F
(
