
    /hF:                     6   S SK r S SKrS SKrS SKrS SKJr  S SKJrJrJ	r	  S SK
JrJr  S SKJrJr  S rS rS rS	 rS
 rS rS rS rS rS rS rS r                   SS jrS rSS jr\" SS/5      r\" / SQ5      r S r!\"S:X  a  \" 5         gg)    N)treebank)BrillTaggerTrainerRegexpTaggerUnigramTagger)PosWord)Template
error_listc                      [        5         g)zx
Run a demo with defaults. See source comments for details,
or docstrings of any of the more specific demo_* functions.
Npostag     E/var/www/auris/envauris/lib/python3.13/site-packages/nltk/tbl/demo.pydemor      s	    
 Hr   c                      [        SS9  g)F
Exemplify repr(Rule) (see also str(Rule) and Rule.format("verbose"))
repr
ruleformatNr   r   r   r   demo_repr_rule_formatr      s     fr   c                      [        SS9  g)r   strr   Nr   r   r   r   demo_str_rule_formatr   $   s     er   c                      [        SS9  g)z"
Exemplify Rule.format("verbose")
verboser   Nr   r   r   r   demo_verbose_rule_formatr   +   s     i r   c                  @    [        [        [        / SQ5      5      /S9  g)a  
The feature/s of a template takes a list of positions
relative to the current word where the feature should be
looked for, conceptually joined by logical OR. For instance,
Pos([-1, 1]), given a value V, will hold whenever V is found
one step to the left and/or one step to the right.

For contiguous ranges, a 2-arg form giving inclusive end
points can also be used: Pos(-3, -1) is the same as the arg
below.
)	templatesN)r   r	   r   r   r   r   demo_multiposition_featurer$   2   s     hs<0123r   c            	      V    [        [        [        S/5      [        SS/5      5      /S9  g)z0
Templates can have more than a single feature.
r   r    r!   r"   N)r   r	   r   r   r   r   r   demo_multifeature_templater&   A   s$     htQCy#r2h-89:r   c                      [        SSS9  g)aP  
Show aggregate statistics per template. Little used templates are
candidates for deletion, much used templates may possibly be refined.

Deleting unused templates is mostly about saving time and/or space:
training is basically O(T) in the number of templates T
(also in terms of memory usage, which often will be the limiting factor).
T)incremental_statstemplate_statsNr   r   r   r   demo_template_statisticsr*   H   s     T$7r   c                     [         R                  " / SQSS/SS9n [        R                  " / SQSS/SS9n[        [        R                  " X/SS	95      n[        S
R                  [        U5      5      5        [        USSS9  g)z
Template.expand and Feature.expand are class methods facilitating
generating large amounts of templates. See their documentation for
details.

Note: training with 500 templates can easily fill all available
even on relatively small corpora
)r!   r      r,      F)excludezero)r    r!   r   r,   T)r,      )combinationsz8Generated {} templates for transformation-based learning)r#   r(   r)   N)	r   expandr   listr	   printformatlenr   )wordtplstagtplsr#   s      r   demo_generated_templatesr8   T   ss     {{:1v5AHjj!QTBGX__h%8vNOI	BII	N	

 Y$tLr   c                      [        SSSS9  g)zr
Plot a learning curve -- the contribution on tagging accuracy of
the individual rules.
Note: requires matplotlib
Tzlearningcurve.png)r(   separate_baseline_datalearning_curve_outputNr   r   r   r   demo_learning_curver<   h   s     #1r   c                      [        SS9  g)zO
Writes a file with context for each erroneous word after tagging testing data
z
errors.txt)error_outputNr   r   r   r   demo_error_analysisr?   u   s     %r   c                      [        SS9  g)za
Serializes the learned tagger to a file in pickle format; reloads it
and validates the process.
z
tagger.pcl)serialize_outputNr   r   r   r   demo_serialize_taggerrB   |   s    
 L)r   c                      [        SSSS9  g)z
Discard rules with low accuracy. This may hurt performance a bit,
but will often produce rules which are more interesting read to a human.
i  gQ?
   )	num_sentsmin_acc	min_scoreNr   r   r   r   demo_high_accuracy_rulesrH      s    
 T426r   c           	         U=(       d    [         nU c  SSKJnJn  U" 5       n [	        XX(U5      u  nnnnU(       a  [
        R                  R                  U5      (       dP  [        UUS9n[        US5       n[        R                  " UU5        SSS5        [        SR                  U5      5        [        U5       n[        R                  " U5      n[        SU 35        SSS5        O[        UUS9n[        S5        U(       a)  [        S	R                  WR                  U5      5      5        [         R                   " 5       n[#        WXU	S
9n[        S5        UR%                  UX4U5      n[        S[         R                   " 5       U-
  S S35        U(       a  [        SUR                  U5      -  5        US:X  aP  [        S5        ['        UR)                  5       S5       H'  u  nn[        US SUR                  U	5      S 35        M)     U
(       a  [        S5        UR+                  UU5      u  n n![        S5        U(       d  [        S5        UR-                  5       n"U(       a  UR/                  U!5        U(       a  [1        UU!U"US9  [        SU 35        O3[        S5        UR3                  U5      n U(       a  UR/                  5         Ubs  [        US5       n#U#R5                  SU-  5        U#R5                  SR7                  [9        UU 5      5      R;                  S5      S-   5        SSS5        [        SU 35        Ub  UR3                  U5      n [        US5       n[        R                  " UU5        SSS5        [        SU 35        [        U5       n[        R                  " U5      n$SSS5        [        SU 35        UR3                  U5      n%U U%:X  a  [        S 5        g[        S!5        gg! , (       d  f       GNr= f! , (       d  f       GN= f! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       N= f)"a	  
Brill Tagger Demonstration
:param templates: how many sentences of training and testing data to use
:type templates: list of Template

:param tagged_data: maximum number of rule instances to create
:type tagged_data: C{int}

:param num_sents: how many sentences of training and testing data to use
:type num_sents: C{int}

:param max_rules: maximum number of rule instances to create
:type max_rules: C{int}

:param min_score: the minimum score for a rule in order for it to be considered
:type min_score: C{int}

:param min_acc: the minimum score for a rule in order for it to be considered
:type min_acc: C{float}

:param train: the fraction of the the corpus to be used for training (1=all)
:type train: C{float}

:param trace: the level of diagnostic tracing output to produce (0-4)
:type trace: C{int}

:param randomize: whether the training data should be a random subset of the corpus
:type randomize: C{bool}

:param ruleformat: rule output format, one of "str", "repr", "verbose"
:type ruleformat: C{str}

:param incremental_stats: if true, will tag incrementally and collect stats for each rule (rather slow)
:type incremental_stats: C{bool}

:param template_stats: if true, will print per-template statistics collected in training and (optionally) testing
:type template_stats: C{bool}

:param error_output: the file where errors will be saved
:type error_output: C{string}

:param serialize_output: the file where the learned tbl tagger will be saved
:type serialize_output: C{string}

:param learning_curve_output: filename of plot of learning curve(s) (train and also test, if available)
:type learning_curve_output: C{string}

:param learning_curve_take: how many rules plotted
:type learning_curve_take: C{int}

:param baseline_backoff_tagger: the file where rules will be saved
:type baseline_backoff_tagger: tagger

:param separate_baseline_data: use a fraction of the training data exclusively for training baseline
:type separate_baseline_data: C{bool}

:param cache_baseline_tagger: cache baseline tagger to this file (only interesting as a temporary workaround to get
                              deterministic output from the baseline unigram tagger between python versions)
:type cache_baseline_tagger: C{string}


Note on separate_baseline_data: if True, reuse training data both for baseline and rule learner. This
is fast and fine for a demo, but is likely to generalize worse on unseen data.
Also cannot be sensibly used for learning curves on training data (the baseline will be artificially high).
Nr   )brill24describe_template_sets)backoffwz)Trained baseline tagger, pickled it to {}zReloaded pickled tagger from zTrained baseline taggerz!    Accuracy on test set: {:0.4f}r   zTraining tbl tagger...zTrained tbl tagger in z0.2fz secondsz    Accuracy on test set: %.4fr,   z
Learned rules: 4d szJIncrementally tagging the test data, collecting individual rule statisticsz    Rule statistics collectedzbWARNING: train_stats asked for separate_baseline_data=True; the baseline will be artificially high)takez Wrote plot of learning curve to zTagging the test datazErrors for Brill Tagger %r


zutf-8z)Wrote tagger errors including context to zWrote pickled tagger to z4Reloaded tagger tried on test set, results identicalz;PROBLEM: Reloaded tagger gave different results on test set)REGEXP_TAGGERnltk.tag.brillrJ   rK   _demo_prepare_dataospathexistsr   openpickledumpr3   r4   loadaccuracytimer   train	enumeraterulesbatch_tag_incrementaltrain_statsprint_template_statistics
_demo_plot	tag_sentswritejoinr
   encode)&r#   tagged_datarE   	max_rulesrG   rF   r_   trace	randomizer   r(   r)   r>   rA   r;   learning_curve_takebaseline_backoff_taggerr:   cache_baseline_taggerrJ   rK   training_databaseline_data	gold_datatesting_databaseline_taggerprint_rulestbrilltrainerbrill_taggerrulenorule
taggedtest	teststats
trainstatsfbrill_tagger_reloadedtaggedtest_reloadeds&                                         r   r   r      s   p 6FB
 I	>PI2H?;]M9l ww~~344+'>O +S1[O[9 2;BB)
 '(K$kk+6O12G1HIJ )( (?VW'(/66((3	
 YY[F jG 

"#==	gNL	"499;#7"=X
FG.1F1Fy1QQR z!"%l&8&8&:A>LFDVBKqZ!8 ;<= ?
 X	
 #/"D"D)#
Y 	-.%, "--/
229= %y*CV 45J4KLM%&!++L9
224 ,$GG47GGHGGDIIjJ?@GGPSWWX % 	9,HI #!++L9
"C(KKKk2 )()9(:;<"#{$*KK$<! $-.>-?@A*44\B,,HIOP $U 21 )(z %$ )( $#s=   5O8:%P
AP"P-P>8
P

P
P*-
P;>
Qc           	         U c   [        S5        [        R                  " 5       n Ub  [        U 5      U::  a  [        U 5      nU(       a5  [        R
                  " [        U 5      5        [        R                  " U 5        [        X!-  5      nU S U nXU nU VV	s/ s H  o V	s/ s H  oS   PM	     sn	PM     n
nn	U(       d  UnO[        U5      S-  nUS U XlS  pk[        U5      u  p[        U
5      u  nn[        U5      u  nn[        SUS SUS S35        [        SUS SUS S35        [        S	R                  UUU(       a  S
OS5      5        XkXz4$ s  sn	f s  sn	nf )Nz%Loading tagged data from treebank... r   r/   zRead testing data (dz sents/z wds)zRead training data (z-Read baseline data ({:d} sents/{:d} wds) {:s} z[reused the training set])
r3   r   tagged_sentsr5   randomseedshuffleintcorpus_sizer4   )rj   r_   rE   rm   r:   cutoffrq   rs   senttrt   rr   	bl_cutoff	trainseqstraintokenstestseqs
testtokensbltrainseqsbltraintokenss                      r   rU   rU   Q  sq   
 56++-C,	9$	C$%{#"#F(M9-I5>?YT4(4aqT4(YL?!%&!+	*9%*% &  +=9Y(6Xz#.}#= [-	|7:a.
FG	 1W[O5
IJ	7>>(B.I	
 )BB+ )?s   	E!E/EEc                    US   /nUS    H  nUR                  US   U-
  5        M     US U  Vs/ s H  nSXaS   -  -
  PM     nnUS   /nUS    H  nUR                  US   U-
  5        M     US U  Vs/ s H  nSXbS   -  -
  PM     nnSS KJn  [        [	        [        U5      5      5      n	UR                  XX5        UR                  / SQ5        UR                  U 5        g s  snf s  snf )Ninitialerrors
rulescoresr!   r,   
tokencountr   )NNNg      ?)	appendmatplotlib.pyplotpyplotr2   ranger5   plotaxissavefig)
r;   r}   r~   rQ   	testcurve	rulescorex
traincurvepltrs
             r   re   re   y  s   ?+,I|,	223 -:CET:JK:JQQ<000:JIK_-.J-	*R.945 .<Fu<MN<Mq!a\222<MJN#U3y>"#AHHQ1)HH$%KK%& L
 Os   C'7C,z^-?[0-9]+(\.[0-9]+)?$CDz.*NN)	r   )z(The|the|A|a|An|an)$AT)z.*able$JJ)z.*ness$r   )z.*ly$RB)z.*s$NNS)z.*ing$VBG)z.*ed$VBDr   c                 <    [        U 5      [        S U  5       5      4$ )Nc              3   8   #    U  H  n[        U5      v   M     g 7f)N)r5   ).0r   s     r   	<genexpr>corpus_size.<locals>.<genexpr>  s     04a3q664s   )r5   sum)seqss    r   r   r     s    Is040011r   __main__)NNi  ,  r/   Ng?r/   Fr   FFNNNr   NFN)NN)#rV   rZ   r   r^   nltk.corpusr   nltk.tagr   r   r   rT   r   r   nltk.tblr	   r
   r   r   r   r   r$   r&   r*   r8   r<   r?   rB   rH   r   rU   re   NN_CD_TAGGERrS   r   __name__r   r   r   <module>r      s    
      D D $ )!4;	8M(
&*7 

  'BQJ%CP'& =}MN
2 z r   