
    /h                        S r  SSKrSSKrSSKrSSKJr  SSKJr  SSK	J
r
JrJr  SSKJrJrJr  SSKJrJrJr  SSKJr  SS	KJr  SS
KJr  Sr " S S\5      r\r " S S5      r " S S\5      r " S S\5      r  " S S\ 5      r! " S S\ 5      r" " S S\5      r# S(S jr$S r%S r& S(S jr'S r(S r) S)S  jr* " S! S"\5      r+S# r,S*S$ jr-S% r.S& r/\0S':X  a  \/" 5         gg! \ a     Nf = f)+a  
A classifier model based on maximum entropy modeling framework.  This
framework considers all of the probability distributions that are
empirically consistent with the training data; and chooses the
distribution with the highest entropy.  A probability distribution is
"empirically consistent" with a set of training data if its estimated
frequency with which a class and a feature vector value co-occur is
equal to the actual frequency in the data.

Terminology: 'feature'
======================
The term *feature* is usually used to refer to some property of an
unlabeled token.  For example, when performing word sense
disambiguation, we might define a ``'prevword'`` feature whose value is
the word preceding the target word.  However, in the context of
maxent modeling, the term *feature* is typically used to refer to a
property of a "labeled" token.  In order to prevent confusion, we
will introduce two distinct terms to disambiguate these two different
concepts:

  - An "input-feature" is a property of an unlabeled token.
  - A "joint-feature" is a property of a labeled token.

In the rest of the ``nltk.classify`` module, the term "features" is
used to refer to what we will call "input-features" in this module.

In literature that describes and discusses maximum entropy models,
input-features are typically called "contexts", and joint-features
are simply referred to as "features".

Converting Input-Features to Joint-Features
-------------------------------------------
In maximum entropy models, joint-features are required to have numeric
values.  Typically, each input-feature ``input_feat`` is mapped to a
set of joint-features of the form:

|   joint_feat(token, label) = { 1 if input_feat(token) == feat_val
|                              {      and label == some_label
|                              {
|                              { 0 otherwise

For all values of ``feat_val`` and ``some_label``.  This mapping is
performed by classes that implement the ``MaxentFeatureEncodingI``
interface.
    N)defaultdict)ClassifierI)
call_megamparse_megam_weightswrite_megam_file)	call_tadmparse_tadm_weightswrite_tadm_file)CutoffCheckeraccuracylog_likelihood)gzip_open_unicode)DictionaryProbDist)OrderedDictz
epytext enc                       \ rS rSrSrSS jrS rS rS rS r	S r
SS	 jrSS
 jrSS jrS r/ SQr\     SS j5       rSrg)MaxentClassifierN   a  
A maximum entropy classifier (also known as a "conditional
exponential classifier").  This classifier is parameterized by a
set of "weights", which are used to combine the joint-features
that are generated from a featureset by an "encoding".  In
particular, the encoding maps each ``(featureset, label)`` pair to
a vector.  The probability of each label is then computed using
the following equation::

                            dotprod(weights, encode(fs,label))
  prob(fs|label) = ---------------------------------------------------
                   sum(dotprod(weights, encode(fs,l)) for l in labels)

Where ``dotprod`` is the dot product::

  dotprod(a,b) = sum(x*y for (x,y) in zip(a,b))
c                 f    Xl         X l        X0l        UR                  5       [	        U5      :X  d   eg)a  
Construct a new maxent classifier model.  Typically, new
classifier models are created using the ``train()`` method.

:type encoding: MaxentFeatureEncodingI
:param encoding: An encoding that is used to convert the
    featuresets that are given to the ``classify`` method into
    joint-feature vectors, which are used by the maxent
    classifier model.

:type weights: list of float
:param weights:  The feature weight vector for this classifier.

:type logarithmic: bool
:param logarithmic: If false, then use non-logarithmic weights.
N)	_encoding_weights_logarithmiclengthlen)selfencodingweightslogarithmics       L/var/www/auris/envauris/lib/python3.13/site-packages/nltk/classify/maxent.py__init__MaxentClassifier.__init__a   s-    " "' CL000    c                 6    U R                   R                  5       $ N)r   labelsr   s    r   r$   MaxentClassifier.labelsx   s    ~~$$&&r!   c                 b    Xl         U R                  R                  5       [        U5      :X  d   eg)z
Set the feature weight vector for this classifier.
:param new_weights: The new feature weight vector.
:type new_weights: list of float
N)r   r   r   r   )r   new_weightss     r   set_weightsMaxentClassifier.set_weights{   s)     $~~$$&#k*::::r!   c                     U R                   $ )zO
:return: The feature weight vector for this classifier.
:rtype: list of float
)r   r%   s    r   r   MaxentClassifier.weights   s    
 }}r!   c                 @    U R                  U5      R                  5       $ r#   )prob_classifymax)r   
featuresets     r   classifyMaxentClassifier.classify   s    !!*-1133r!   c                 `   0 nU R                   R                  5        H{  nU R                   R                  X5      nU R                  (       a'  SnU H  u  pgXPR                  U   U-  -  nM     XRU'   MV  SnU H  u  pgXR                  U   U-  -  nM     XU'   M}     [        X R                  SS9$ )Ng              ?T)log	normalize)r   r$   encoder   r   r   )	r   r0   	prob_dictlabelfeature_vectortotalf_idf_valprods	            r   r.   MaxentClassifier.prob_classify   s    	^^**,E!^^22:EN  #1KD]]40588E $2#(%  #1KDMM$/588D $2#'%  -  ")1B1BdSSr!   c           	      z  ^ ^^ SnS[        US-
  5      -   S-   nT R                  U5      m[        TR                  5       TR                  SS9nUSU n[        SR                  U5      S	R                  S
 U 5       5      -   5        [        SSUS-
  S[        U5      -  -   -  -   5        [        [        5      m[        U5       H  u  pgT R                  R                  X5      nUR                  U 4S jSS9  U H  u  pT R                  (       a  T R                   U	   U
-  nOT R                   U	   U
-  nT R                  R#                  U	5      nUR%                  S5      S   nUSU
-  -  n[        U5      S:  a  USS S-   n[        XLUS-  S-  U4-  5        TU==   U-  ss'   M     M     [        SSUS-
  S[        U5      -  -   -  -   5        [        SR                  U5      S	R                  U4S jU 5       5      -   5        [        SR                  U5      S	R                  U4S jU 5       5      -   5        g)z
Print a table showing the effect of each of the features in
the given feature set, and how they combine to determine the
probabilities of each label for that featureset.
2   z  %-   zs%s%8.3fTkeyreverseNz	  Feature c              3   8   #    U  H  nS SU-  SS -  v   M     g7f)z%8s%sN    ).0ls     r   	<genexpr>+MaxentClassifier.explain.<locals>.<genexpr>   s     ?1eq"1~.s   z  -   c                 :   > [        TR                  U S      5      $ )Nr   absr   )fid__r   s    r   <lambda>*MaxentClassifier.explain.<locals>.<lambda>   s    #dmmE!H&=">r!    and label is r   z (%s)/   ,   z...    z  TOTAL:c              3   4   >#    U  H  nS TU   -  v   M     g7fz%8.3fNrJ   )rK   rL   sumss     r   rM   rN      s     3Vv!Gd1g4Evs   z  PROBS:c              3   L   >#    U  H  nS TR                  U5      -  v   M     g7fr]   )prob)rK   rL   pdists     r   rM   rN      s     >v!g

1-vs   !$)strr.   sortedsamplesr`   printljustjoinr   r   int	enumerater   r7   sortr   r   describesplit)r   r0   columnsdescr_widthTEMPLATEr$   ir9   r:   r<   r=   scoredescrra   r^   s   `            @@r   explainMaxentClassifier.explain   s$    Ca00:="":.UZZF!k*gg???@	
 	dSK!Oa#f+o=>>?3!&)HA!^^22:EN>     .$$ MM$/%7E MM$/58E//5$45a85(u:?!#2J.EhQe!<<=Uu$  . *" 	dSK!Oa#f+o=>>?[)BGG3Vv3V,VV	
 	[)gg>v>>?	
r!   c           	         ^  [        T S5      (       a  T R                  SU $ [        [        [	        [        T R                  5      5      5      U 4S jSS9T l        T R                  SU $ )zG
Generates the ranked list of informative features from most to least.
_most_informative_featuresNc                 4   > [        TR                  U    5      $ r#   rR   )fidr   s    r   rU   <MaxentClassifier.most_informative_features.<locals>.<lambda>   s    DMM#$6 7r!   TrC   )hasattrrv   rc   listranger   r   )r   ns   ` r   most_informative_features*MaxentClassifier.most_informative_features   sd     4566222A66.4U3t}}-./7/D+
 222A66r!   c                 r   U R                  S5      nUS:X  a'  U Vs/ s H  o@R                  U   S:  d  M  UPM     nnO,US:X  a&  U Vs/ s H  o@R                  U   S:  d  M  UPM     nnUSU  H:  n[        U R                  U   S SU R                  R	                  U5       35        M<     gs  snf s  snf )z
:param show: all, neg, or pos (for negative-only or positive-only)
:type show: str
:param n: The no. of top features
:type n: int
Nposr   negz8.3frZ   )r~   r   re   r   rk   )r   r}   showfidsrx   s        r   show_most_informative_features/MaxentClassifier.show_most_informative_features   s     --d35=#'B4C==+=+AC4DBDU]#'B4C==+=+AC4DB8CT]]3'-Qt~~/F/Fs/K.LMN  CBs   B/B/	B4#B4c                     S[        U R                  R                  5       5      U R                  R                  5       4-  $ )Nz:<ConditionalExponentialClassifier: %d labels, %d features>)r   r   r$   r   r%   s    r   __repr__MaxentClassifier.__repr__   s:    K%%'(NN!!#O
 
 	
r!   )GISIISMEGAMTADMNc                 T   Uc  SnU H  nUS;  d  M  [        SU-  5      e   UR                  5       nUS:X  a  [        XXE40 UD6$ US:X  a  [        XXE40 UD6$ US:X  a  [	        XXEU40 UD6$ US:X  a)  Un	X9S'   XIS'   XYS	'   XiS
'   [
        R                  " U40 U	D6$ [        SU-  5      e)aV  
Train a new maxent classifier based on the given corpus of
training samples.  This classifier will have its weights
chosen to maximize entropy while remaining empirically
consistent with the training corpus.

:rtype: MaxentClassifier
:return: The new maxent classifier

:type train_toks: list
:param train_toks: Training data, represented as a list of
    pairs, the first member of which is a featureset,
    and the second of which is a classification label.

:type algorithm: str
:param algorithm: A case-insensitive string, specifying which
    algorithm should be used to train the classifier.  The
    following algorithms are currently available.

    - Iterative Scaling Methods: Generalized Iterative Scaling (``'GIS'``),
      Improved Iterative Scaling (``'IIS'``)
    - External Libraries (requiring megam):
      LM-BFGS algorithm, with training performed by Megam (``'megam'``)

    The default algorithm is ``'IIS'``.

:type trace: int
:param trace: The level of diagnostic tracing output to produce.
    Higher values produce more verbose output.
:type encoding: MaxentFeatureEncodingI
:param encoding: A feature encoding, used to convert featuresets
    into feature vectors.  If none is specified, then a
    ``BinaryMaxentFeatureEncoding`` will be built based on the
    features that are attested in the training corpus.
:type labels: list(str)
:param labels: The set of possible labels.  If none is given, then
    the set of all labels attested in the training data will be
    used instead.
:param gaussian_prior_sigma: The sigma value for a gaussian
    prior on model weights.  Currently, this is supported by
    ``megam``. For other algorithms, its value is ignored.
:param cutoffs: Arguments specifying various conditions under
    which the training should be halted.  (Some of the cutoff
    conditions are not supported by some algorithms.)

    - ``max_iter=v``: Terminate after ``v`` iterations.
    - ``min_ll=v``: Terminate after the negative average
      log-likelihood drops under ``v``.
    - ``min_lldelta=v``: Terminate if a single iteration improves
      log likelihood by less than ``v``.
iis)	max_itermin_llmin_lldeltamax_accmin_accdeltacount_cutoffnormexplicit	bernoullizUnexpected keyword arg %rgismegamtadmtracer   r$   gaussian_prior_sigmazUnknown algorithm %s)	TypeErrorlower train_maxent_classifier_with_iis train_maxent_classifier_with_gis"train_maxent_classifier_with_megamTadmMaxentClassifiertrain
ValueError)
cls
train_toks	algorithmr   r   r$   r   cutoffsrD   kwargss
             r   r   MaxentClassifier.train   s   | IC 
 
   ;c ABB  OO%	387>  %387>  '!585IMT  & F#7O!):%8-A)*'--jCFCC3i?@@r!   )r   r   rv   r   )T)   )
   )r   all)N   NNr   )__name__
__module____qualname____firstlineno____doc__r   r$   r)   r   r1   r.   rs   r~   r   r   
ALGORITHMSclassmethodr   __static_attributes__rJ   r!   r   r   r   N   sj    $1.';4T(*
X7O 
 1J aA aAr!   r   c                   6    \ rS rSrSrS rS rS rS rS r	Sr
g	)
MaxentFeatureEncodingIid  ap  
A mapping that converts a set of input-feature values to a vector
of joint-feature values, given a label.  This conversion is
necessary to translate featuresets into a format that can be used
by maximum entropy models.

The set of joint-features used by a given encoding is fixed, and
each index in the generated joint-feature vectors corresponds to a
single joint-feature.  The length of the generated joint-feature
vectors is therefore constant (for a given encoding).

Because the joint-feature vectors generated by
``MaxentFeatureEncodingI`` are typically very sparse, they are
represented as a list of ``(index, value)`` tuples, specifying the
value of each non-zero joint-feature.

Feature encodings are generally created using the ``train()``
method, which generates an appropriate encoding based on the
input-feature values and labels that are present in a given
corpus.
c                     [        5       e)a  
Given a (featureset, label) pair, return the corresponding
vector of joint-feature values.  This vector is represented as
a list of ``(index, value)`` tuples, specifying the value of
each non-zero joint-feature.

:type featureset: dict
:rtype: list(tuple(int, int))
NotImplementedErrorr   r0   r9   s      r   r7   MaxentFeatureEncodingI.encode{       "##r!   c                     [        5       e)zr
:return: The size of the fixed-length joint-feature vectors
    that are generated by this encoding.
:rtype: int
r   r%   s    r   r   MaxentFeatureEncodingI.length       "##r!   c                     [        5       e)z
:return: A list of the "known labels" -- i.e., all labels
    ``l`` such that ``self.encode(fs,l)`` can be a nonzero
    joint-feature vector for some value of ``fs``.
:rtype: list
r   r%   s    r   r$   MaxentFeatureEncodingI.labels  s     "##r!   c                     [        5       e)z
:return: A string describing the value of the joint-feature
    whose index in the generated feature vectors is ``fid``.
:rtype: str
r   r   rx   s     r   rk   MaxentFeatureEncodingI.describe  r   r!   c                     [        5       e)a7  
Construct and return new feature encoding, based on a given
training corpus ``train_toks``.

:type train_toks: list(tuple(dict, str))
:param train_toks: Training data, represented as a list of
    pairs, the first member of which is a feature dictionary,
    and the second of which is a classification label.
r   )r   r   s     r   r   MaxentFeatureEncodingI.train  r   r!   rJ   N)r   r   r   r   r   r7   r   r$   rk   r   r   rJ   r!   r   r   r   d  s     ,
$$$$
$r!   r   c                   6    \ rS rSrSrS rS rS rS rS r	Sr
g	)
#FunctionBackedMaxentFeatureEncodingi  z
A feature encoding that calls a user-supplied function to map a
given featureset/label pair to a sparse joint-feature vector.
c                 (    X l         Xl        X0l        g)a  
Construct a new feature encoding based on the given function.

:type func: (callable)
:param func: A function that takes two arguments, a featureset
     and a label, and returns the sparse joint feature vector
     that encodes them::

         func(featureset, label) -> feature_vector

     This sparse joint feature vector (``feature_vector``) is a
     list of ``(index,value)`` tuples.

:type length: int
:param length: The size of the fixed-length joint-feature
    vectors that are generated by this encoding.

:type labels: list
:param labels: A list of the "known labels" for this
    encoding -- i.e., all labels ``l`` such that
    ``self.encode(fs,l)`` can be a nonzero joint-feature vector
    for some value of ``fs``.
N)_length_func_labels)r   funcr   r$   s       r   r   ,FunctionBackedMaxentFeatureEncoding.__init__  s    0 
r!   c                 $    U R                  X5      $ r#   )r   r   s      r   r7   *FunctionBackedMaxentFeatureEncoding.encode  s    zz*,,r!   c                     U R                   $ r#   r   r%   s    r   r   *FunctionBackedMaxentFeatureEncoding.length      ||r!   c                     U R                   $ r#   r   r%   s    r   r$   *FunctionBackedMaxentFeatureEncoding.labels  r   r!   c                     g)Nzno description availablerJ   r   s     r   rk   ,FunctionBackedMaxentFeatureEncoding.describe  s    )r!   )r   r   r   N)r   r   r   r   r   r   r7   r   r$   rk   r   rJ   r!   r   r   r     s     
8-*r!   r   c                   N    \ rS rSrSrSS jrS rS rS rS r	\
SS	 j5       rS
rg)BinaryMaxentFeatureEncodingi  a  
A feature encoding that generates vectors containing a binary
joint-features of the form:

|  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
|                      {
|                      { 0 otherwise

Where ``fname`` is the name of an input-feature, ``fval`` is a value
for that input-feature, and ``label`` is a label.

Typically, these features are constructed based on a training
corpus, using the ``train()`` method.  This method will create one
feature for each combination of ``fname``, ``fval``, and ``label``
that occurs at least once in the training corpus.

The ``unseen_features`` parameter can be used to add "unseen-value
features", which are used whenever an input feature has a value
that was not encountered in the training corpus.  These features
have the form:

|  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
|                      {      and l == label
|                      {
|                      { 0 otherwise

Where ``is_unseen(fname, fval)`` is true if the encoding does not
contain any joint features that are true when ``fs[fname]==fval``.

The ``alwayson_features`` parameter can be used to add "always-on
features", which have the form::

|  joint_feat(fs, l) = { 1 if (l == label)
|                      {
|                      { 0 otherwise

These always-on features allow the maxent model to directly model
the prior probabilities of each label.
c                    [        UR                  5       5      [        [        [        U5      5      5      :w  a  [	        S5      e[        U5      U l         X l         [        U5      U l         SU l	         SU l
         U(       aZ  [        U5       VVs0 s H  u  pVXeU R                  -   _M     snnU l	        U =R                  [        U R                  5      -  sl        U(       ai  U VVVs1 s H  u  pxogiM	     n	nnn[        U	5       VVs0 s H  u  pWXuU R                  -   _M     snnU l
        U =R                  [        U	5      -  sl        ggs  snnf s  snnnf s  snnf )a[  
:param labels: A list of the "known labels" for this encoding.

:param mapping: A dictionary mapping from ``(fname,fval,label)``
    tuples to corresponding joint-feature indexes.  These
    indexes must be the set of integers from 0...len(mapping).
    If ``mapping[fname,fval,label]=id``, then
    ``self.encode(..., fname:fval, ..., label)[id]`` is 1;
    otherwise, it is 0.

:param unseen_features: If true, then include unseen value
   features in the generated joint-feature vectors.

:param alwayson_features: If true, then include always-on
   features in the generated joint-feature vectors.
HMapping values must be exactly the set of integers from 0...len(mapping)Nsetvaluesr|   r   r   r{   r   _mappingr   	_alwayson_unseenri   
r   r$   mappingunseen_featuresalwayson_featuresrp   r9   fnamefvalfnamess
             r   r   $BinaryMaxentFeatureEncoding.__init__  *   " w~~ Cc'l(;$<<8 
 F|(97|<,,:CF:K:KJQ4<<'':KDN LLC//L8?@ 4eeF@FOPVFWXFW
Et||#33FWXDLLLCK'L  AX   E+EEc                     / nUR                  5        H  u  pEXEU4U R                  ;   a$  UR                  U R                  XEU4   S45        M;  U R                  (       d  MN  U R                   H  nXEU4U R                  ;   d  M    Mt     X@R                  ;   d  M  UR                  U R                  U   S45        M     U R
                  (       a/  X R
                  ;   a   UR                  U R
                  U   S45        U$ Nr[   )itemsr   appendr   r   r   r   r0   r9   r   r   r   label2s          r   r7   "BinaryMaxentFeatureEncoding.encode6  s     &++-KEU#t}}4uE/A!BA FG "llFV,= +
 , e)<a(@A ." >>e~~5OOT^^E2A67r!   c                    [        U[        5      (       d  [        S5      e U R                    U[        U R                  5      :  a  U R                  U   u  pEnU SU< SU< 3$ U R                  (       aM  XR                  R                  5       ;   a0  U R                  R                  5        H  u  pgX:X  d  M  SU-  s  $    g U R                  (       aM  XR                  R                  5       ;   a0  U R                  R                  5        H  u  pGX:X  d  M  SU-  s  $    g [        S5      e! [         aS    S/[        U R                  5      -  U l        U R                  R                  5        H  u  p#X R                  U'   M      GNZf = fNzdescribe() expected an intz==rW   zlabel is %rz%s is unseenzBad feature id
isinstancerh   r   _inv_mappingAttributeErrorr   r   r   r   r   r   r   r   r<   inforp   r   r   r9   f_id2s           r   rk   $BinaryMaxentFeatureEncoding.describeQ  L   $$$899	, #dmm$$#'#4#4T#: U%WBthnUI>>^^(=(=(? ? $ 4 4 6=(500 !7 \\dll&9&9&;; $ 2 2 4=)E11 !5 -..#  	,!#s4=='9 9D==..0'+!!!$ 1	,   D, ,AF	F	c                     U R                   $ r#   r   r%   s    r   r$   "BinaryMaxentFeatureEncoding.labelsj      ||r!   c                     U R                   $ r#   r   r%   s    r   r   "BinaryMaxentFeatureEncoding.lengthn  r  r!   Nc                 X   0 n[        5       n[        [        5      nU H|  u  pU(       a  X;  a  [        SU	-  5      eUR	                  U	5        UR                  5        H8  u  pXzU4==   S-  ss'   XzU4   U:  d  M  XU	4U;  d  M)  [        U5      XZX4'   M:     M~     Uc  UnU " X540 UD6$ )a  
Construct and return new feature encoding, based on a given
training corpus ``train_toks``.  See the class description
``BinaryMaxentFeatureEncoding`` for a description of the
joint-features that will be included in this encoding.

:type train_toks: list(tuple(dict, str))
:param train_toks: Training data, represented as a list of
    pairs, the first member of which is a feature dictionary,
    and the second of which is a classification label.

:type count_cutoff: int
:param count_cutoff: A cutoff value that is used to discard
    rare joint-features.  If a joint-feature's value is 1
    fewer than ``count_cutoff`` times in the training corpus,
    then that joint-feature is not included in the generated
    encoding.

:type labels: list
:param labels: A list of labels that should be used by the
    classifier.  If not specified, then the set of labels
    attested in ``train_toks`` will be used.

:param options: Extra parameters for the constructor, such as
    ``unseen_features`` and ``alwayson_features``.
Unexpected label %sr[   )r   r   rh   r   addr   r   r   r   r   r$   optionsr   seen_labelscounttokr9   r   r   s               r   r   !BinaryMaxentFeatureEncoding.trainr  s    8 eC $JC%- !6!>??OOE"  #yy{ Tk"a'"%5U+7:69'lt 23  + % > F6.g..r!   r   r   r   r   r   r   FFr   Nr   r   r   r   r   r   r7   rk   r$   r   r   r   r   rJ   r!   r   r   r     s6    &P/(b6/2 0/ 0/r!   r   c                   F    \ rS rSrSr S
S jr\S 5       rS rS r	S r
S	rg)GISEncodingi  a  
A binary feature encoding which adds one new joint-feature to the
joint-features defined by ``BinaryMaxentFeatureEncoding``: a
correction feature, whose value is chosen to ensure that the
sparse vector always sums to a constant non-negative number.  This
new feature is used to ensure two preconditions for the GIS
training algorithm:

  - At least one feature vector index must be nonzero for every
    token.
  - The feature vector must sum to a constant non-negative number
    for every token.
Nc           
          [         R                  XX#U5        Uc%  [        U VVVs1 s H  u  pgoiM	     snnn5      S-   nXPl        gs  snnnf )z
:param C: The correction constant.  The value of the correction
    feature is based on this value.  In particular, its value is
    ``C - sum([v for (f,v) in encoding])``.
:seealso: ``BinaryMaxentFeatureEncoding.__init__``
Nr[   )r   r   r   _C)	r   r$   r   r   r   Cr   r   r9   s	            r   r   GISEncoding.__init__  sO     	$,,'4E	
 9w?w3UUw?@1DA @s   Ac                     U R                   $ )zGThe non-negative constant that all encoded feature vectors
will sum to.r  r%   s    r   r  GISEncoding.C  s     wwr!   c                     [         R                  XU5      n[         R                  U 5      n[        S U 5       5      nXPR                  :  a  [        S5      eUR                  X@R                  U-
  45        U$ )Nc              3   *   #    U  H	  u  pUv   M     g 7fr#   rJ   )rK   fvs      r   rM   %GISEncoding.encode.<locals>.<genexpr>  s     -H&1AH   z&Correction feature is not high enough!)r   r7   r   sumr  r   r   )r   r0   r9   r   base_lengthr;   s         r   r7   GISEncoding.encode  sj    .55dN188> -H--GGEFFggo67 r!   c                 2    [         R                  U 5      S-   $ r   )r   r   r%   s    r   r   GISEncoding.length  s    *11$7!;;r!   c                 |    U[         R                  U 5      :X  a  SU R                  -  $ [         R                  X5      $ )NzCorrection feature (%s))r   r   r  rk   )r   r<   s     r   rk   GISEncoding.describe  s6    .55d;;,tww66.77CCr!   r  )FFN)r   r   r   r   r   r   propertyr  r7   r   rk   r   rJ   r!   r   r  r    s7     RV   
<Dr!   r  c                   J    \ rS rSrS
S jrS rS rS rS r\	SS j5       r
S	rg)TadmEventMaxentFeatureEncodingi  c                     [        U5      U l        [        5       U l        [        R	                  XU R                  X45        g r#   )r   r   _label_mappingr   r   )r   r$   r   r   r   s        r   r   'TadmEventMaxentFeatureEncoding.__init__  s1    #G,)m#,,$--	
r!   c                    / nUR                  5        H  u  pEXB4U R                  ;  a#  [        U R                  5      U R                  XB4'   XPR                  ;  aF  [	        U[
        5      (       d#  [        U R                  5      U R                  U'   OXPR                  U'   UR                  U R                  XB4   U R                  U   45        M     U$ r#   )r   r   r   r1  r   rh   r   )r   r0   r9   r   featurevalues         r   r7   %TadmEventMaxentFeatureEncoding.encode  s    (..0NGt}}425dmm2Dw.////!%--14T5H5H1ID''.16''.OO/0$2E2Ee2LM 1 r!   c                     U R                   $ r#   r   r%   s    r   r$   %TadmEventMaxentFeatureEncoding.labels  r   r!   c                 `    U R                    H  u  p#U R                   X#4   U:X  d  M  X#4s  $    g r#   )r   )r   rx   r4  r9   s       r   rk   'TadmEventMaxentFeatureEncoding.describe  s/    "mmNG}}g-.#5'' ,r!   c                 ,    [        U R                  5      $ r#   )r   r   r%   s    r   r   %TadmEventMaxentFeatureEncoding.length  s    4==!!r!   Nc                     [        5       nU(       d  / n[        U5      nU H  u  pgXs;  d  M  UR                  U5        M     U H/  u  pgU H$  nU H  nX4U;  d  M  [        U5      XXU4'   M     M&     M1     U " X540 UD6$ r#   )r   r{   r   r   )	r   r   r   r$   r  r   r0   r9   r4  s	            r   r   $TadmEventMaxentFeatureEncoding.train  s    -F *%
!+J"e$ ", ",J)G'w647L% 01  *   ", 6.g..r!   )r1  r   r  r  )r   r   r   r   r   r7   r$   rk   r   r   r   r   rJ   r!   r   r/  r/    s/    
(
" / /r!   r/  c                   N    \ rS rSrSrSS jrS rS rS rS r	\
SS	 j5       rS
rg)TypedMaxentFeatureEncodingi  a  
A feature encoding that generates vectors containing integer,
float and binary joint-features of the form:

Binary (for string and boolean features):

|  joint_feat(fs, l) = { 1 if (fs[fname] == fval) and (l == label)
|                      {
|                      { 0 otherwise

Value (for integer and float features):

|  joint_feat(fs, l) = { fval if     (fs[fname] == type(fval))
|                      {         and (l == label)
|                      {
|                      { not encoded otherwise

Where ``fname`` is the name of an input-feature, ``fval`` is a value
for that input-feature, and ``label`` is a label.

Typically, these features are constructed based on a training
corpus, using the ``train()`` method.

For string and boolean features [type(fval) not in (int, float)]
this method will create one feature for each combination of
``fname``, ``fval``, and ``label`` that occurs at least once in the
training corpus.

For integer and float features [type(fval) in (int, float)] this
method will create one feature for each combination of ``fname``
and ``label`` that occurs at least once in the training corpus.

For binary features the ``unseen_features`` parameter can be used
to add "unseen-value features", which are used whenever an input
feature has a value that was not encountered in the training
corpus.  These features have the form:

|  joint_feat(fs, l) = { 1 if is_unseen(fname, fs[fname])
|                      {      and l == label
|                      {
|                      { 0 otherwise

Where ``is_unseen(fname, fval)`` is true if the encoding does not
contain any joint features that are true when ``fs[fname]==fval``.

The ``alwayson_features`` parameter can be used to add "always-on
features", which have the form:

|  joint_feat(fs, l) = { 1 if (l == label)
|                      {
|                      { 0 otherwise

These always-on features allow the maxent model to directly model
the prior probabilities of each label.
c                    [        UR                  5       5      [        [        [        U5      5      5      :w  a  [	        S5      e[        U5      U l         X l         [        U5      U l         SU l	         SU l
         U(       aZ  [        U5       VVs0 s H  u  pVXeU R                  -   _M     snnU l	        U =R                  [        U R                  5      -  sl        U(       ai  U VVVs1 s H  u  pxogiM	     n	nnn[        U	5       VVs0 s H  u  pWXuU R                  -   _M     snnU l
        U =R                  [        U	5      -  sl        ggs  snnf s  snnnf s  snnf )a]  
:param labels: A list of the "known labels" for this encoding.

:param mapping: A dictionary mapping from ``(fname,fval,label)``
    tuples to corresponding joint-feature indexes.  These
    indexes must be the set of integers from 0...len(mapping).
    If ``mapping[fname,fval,label]=id``, then
    ``self.encode({..., fname:fval, ...``, label)[id]} is 1;
    otherwise, it is 0.

:param unseen_features: If true, then include unseen value
   features in the generated joint-feature vectors.

:param alwayson_features: If true, then include always-on
   features in the generated joint-feature vectors.
r   Nr   r   s
             r   r   #TypedMaxentFeatureEncoding.__init__T  r   r   c                    / nUR                  5        GH  u  pE[        U[        [        45      (       aL  U[	        U5      U4U R
                  ;   a.  UR                  U R
                  U[	        U5      U4   U45        Mk  Mm  XEU4U R
                  ;   a$  UR                  U R
                  XEU4   S45        M  U R                  (       d  M  U R                   H  nXEU4U R
                  ;   d  M    M     X@R                  ;   d  M  UR                  U R                  U   S45        GM     U R                  (       a/  X R                  ;   a   UR                  U R                  U   S45        U$ r   )
r   r   rh   floattyper   r   r   r   r   r   s          r   r7   !TypedMaxentFeatureEncoding.encode  s&    &++-KE$e--4:u->OOT]]5$t*e3K%Ld$ST ? '4==8OOT]]53E%F$JK \\\"&,,!0DMMA! #/
 !LL0$OOT\\%-@!,DE' ., >>e~~5OOT^^E2A67r!   c                    [        U[        5      (       d  [        S5      e U R                    U[        U R                  5      :  a  U R                  U   u  pEnU SU< SU< 3$ U R                  (       aM  XR                  R                  5       ;   a0  U R                  R                  5        H  u  pgX:X  d  M  SU-  s  $    g U R                  (       aM  XR                  R                  5       ;   a0  U R                  R                  5        H  u  pGX:X  d  M  SU-  s  $    g [        S5      e! [         aS    S/[        U R                  5      -  U l        U R                  R                  5        H  u  p#X R                  U'   M      GNZf = fr   r   r   s           r   rk   #TypedMaxentFeatureEncoding.describe  r  r  c                     U R                   $ r#   r   r%   s    r   r$   !TypedMaxentFeatureEncoding.labels  r  r!   c                     U R                   $ r#   r   r%   s    r   r   !TypedMaxentFeatureEncoding.length  r  r!   Nc                    0 n[        5       n[        [        5      nU H  u  pU(       a  X;  a  [        SU	-  5      eUR	                  U	5        UR                  5        H\  u  p[        U5      [        [        4;   a  [        U5      nXzU4==   S-  ss'   XzU4   U:  d  MC  XU	4U;  d  MM  [        U5      XZX4'   M^     M     Uc  UnU " X540 UD6$ )aq  
Construct and return new feature encoding, based on a given
training corpus ``train_toks``.  See the class description
``TypedMaxentFeatureEncoding`` for a description of the
joint-features that will be included in this encoding.

Note: recognized feature values types are (int, float), over
types are interpreted as regular binary features.

:type train_toks: list(tuple(dict, str))
:param train_toks: Training data, represented as a list of
    pairs, the first member of which is a feature dictionary,
    and the second of which is a classification label.

:type count_cutoff: int
:param count_cutoff: A cutoff value that is used to discard
    rare joint-features.  If a joint-feature's value is 1
    fewer than ``count_cutoff`` times in the training corpus,
    then that joint-feature is not included in the generated
    encoding.

:type labels: list
:param labels: A list of labels that should be used by the
    classifier.  If not specified, then the set of labels
    attested in ``train_toks`` will be used.

:param options: Extra parameters for the constructor, such as
    ``unseen_features`` and ``alwayson_features``.
r  r[   )	r   r   rh   r   r  r   rE  rD  r   r  s               r   r    TypedMaxentFeatureEncoding.train  s    > eC $JC%- !6!>??OOE"  #yy{:#u-:D Tk"a'"%5U+7:69'lt 23  + %" > F6.g..r!   r  r  r  r  rJ   r!   r   r@  r@    s7    6p/(b@/2 5/ 5/r!   r@  c                    UR                  SS5        [        U5      nUc  [        R                  XS9n[	        US5      (       d  [        S5      eSUR                  -  n[        X5      n[        [        R                  " US:H  5      S   5      n[        R                  " [        U5      S5      n	U H  n
[        R                  X'   M     [        X)5      n[        R                  " U5      nAUS:  a  [!        S	US   -  5        US
:  a   [!        5         [!        S5        [!        S5          US
:  aX  UR"                  =(       d    [%        X5      nUR&                  =(       d    [)        X5      nUR*                  n[!        SXU4-  5        [-        XU5      nU H  n
UU
==   S-  ss'   M     [        R                  " U5      nAUR/                  5       n	XU-
  U-  -  n	UR1                  U	5        UR3                  X5      (       a  OM  US
:  a)  [%        X5      n[)        X5      n[!        SUS SUS 35        U$ ! [4         a    [!        S5         NH  e = f)aj  
Train a new ``ConditionalExponentialClassifier``, using the given
training samples, using the Generalized Iterative Scaling
algorithm.  This ``ConditionalExponentialClassifier`` will encode
the model that maximizes entropy from all the models that are
empirically consistent with ``train_toks``.

:see: ``train_maxent_classifier()`` for parameter descriptions.
r   d   r$   r  zJThe GIS algorithm requires an encoding that defines C (e.g., GISEncoding).r4   r   d  ==> Training (%d iterations)rB   -      Iteration    Log Likelihood    Accuracy-      ---------------------------------------     %9d    %14.5f    %9.3fr[   *      Training stopped: keyboard interrupt         Final    14.5f    9.3f)
setdefaultr   r  r   rz   r   r  calculate_empirical_fcountr   numpynonzerozerosr   NINF ConditionalExponentialClassifierlog2re   llr   accr   itercalculate_estimated_fcountr   r)   checkKeyboardInterrupt)r   r   r   r$   r   cutoffcheckerCinvempirical_fcount
unattestedr   rx   
classifierlog_empirical_fcountrd  re  iternumestimated_fcountlog_estimated_fcounts                     r   r   r     sZ    z3'!'*M $$Z$?8S!!-
 	
 D 2*G U]]#3q#89!<=J kk#./5Gzz 1(DJ !::&67qy.1DDEqy=>=> qy"%%O
)O#''K8J+K',,3wC6HHI  : 
 " %*% "#(::.>#?   !((*G/CCtKKG""7+ "":::5 B qyJ3z."2e*DT
;<   <:;s   *CH7 H7 7IIc                     [         R                  " UR                  5       S5      nU  H+  u  p4UR                  X45       H  u  pVX%==   U-  ss'   M     M-     U$ NrR  )r^  r`  r   r7   )r   r   fcountr  r9   indexvals          r   r]  r]  f  sM    [[*C0F 
"//#5JEMS M 6 ! Mr!   c                 (   [         R                  " UR                  5       S5      nU Hf  u  pEU R                  U5      nUR	                  5        H<  nUR                  U5      nUR                  XE5       H  u  pX8==   Xy-  -  ss'   M     M>     Mh     U$ rt  )r^  r`  r   r.   rd   r`   r7   )
rn  r   r   ru  r  r9   ra   r`   rx   r   s
             r   rg  rg  p  s}    [[*C0F 
((-]]_E::e$D%__S8	t{* 9 % ! Mr!   c           
         UR                  SS5        [        U5      nUc  [        R                  XS9n[	        X5      [        U 5      -  n[        X5      n[        R                  " [        XwR                  S9S5      n[        R                  " U[        U5      S45      n	[        [        R                  " US:H  5      S   5      n
[        R                  " [        U5      S5      nU
 H  n[        R                  X'   M     [!        X+5      nUS:  a  [#        SUS   -  5        US	:  a   [#        5         [#        S
5        [#        S5          US	:  aX  UR$                  =(       d    ['        X5      nUR(                  =(       d    [+        X5      nUR,                  n[#        SUX4-  5        [/        U UU
UUUU	U5      nUR1                  5       nUU-  nUR3                  U5        UR5                  X5      (       a  OM  US	:  a)  ['        X5      n[+        X5      n[#        SUS SUS 35        U$ ! [6         a    [#        S5         NH  e = f)af  
Train a new ``ConditionalExponentialClassifier``, using the given
training samples, using the Improved Iterative Scaling algorithm.
This ``ConditionalExponentialClassifier`` will encode the model
that maximizes entropy from all the models that are empirically
consistent with ``train_toks``.

:see: ``train_maxent_classifier()`` for parameter descriptions.
r   rP  rQ  )rD   rR  r[   r   rS  rB   rT  rU  rV  rW  rX  rY  rZ  r[  )r\  r   r   r   r]  r   calculate_nfmapr^  arrayrc   __getitem__reshaper   r_  r`  ra  rb  re   rd  r   re  r   rf  calculate_deltasr   r)   rh  ri  )r   r   r   r$   r   rj  empirical_ffreqnfmapnfarraynftransposerm  r   rx   rn  rd  re  rp  deltass                     r   r   r     s6    z3'!'*M .44Z4O 1FZXO J1Ekk&,=,=>DG--#g,):;K U]]?a#78;<J kk#o.4Gzz 1(DJqy.1DDEqy=>=> qy"%%O
)O#''K8J+K',,3w6HHI &	F !((*GvG""7+ "":::5 B qyJ3z."2e*DT
;<   <:;s   	B-H* 7H* *IIc                    [        5       nU  HL  u  p4UR                  5        H3  nUR                  [        S UR	                  X55       5       5      5        M5     MN     [        U5       VVs0 s H  u  pgXv_M	     snn$ s  snnf )a  
Construct a map that can be used to compress ``nf`` (which is
typically sparse).

*nf(feature_vector)* is the sum of the feature values for
*feature_vector*.

This represents the number of features that are active for a
given labeled text.  This method finds all values of *nf(t)*
that are attested for at least one token in the given list of
training tokens; and constructs a dictionary mapping these
attested values to a continuous range *0...N*.  For example,
if the only values of *nf()* that were attested were 3, 5, and
7, then ``_nfmap`` might return the dictionary ``{3:0, 5:1, 7:2}``.

:return: A map that can be used to compress ``nf`` to a dense
    vector.
:rtype: dict(int -> int)
c              3   *   #    U  H	  u  pUv   M     g 7fr#   rJ   rK   idrw  s      r   rM   "calculate_nfmap.<locals>.<genexpr>  s     K/J)2#/Jr%  )r   r$   r  r&  r7   ri   )r   r   nfsetr  _r9   rp   nfs           r   rz  rz    sm    * EE__&EIIcKxs/JKKL '  "+5!12!1gqBE!1222s   +A=c           	      |   SnSn	[         R                  " UR                  5       S5      n
[         R                  " [	        U5      UR                  5       4S5      nU  H  u  pUR                  U5      nUR                  5        HU  nUR                  X5      n[        S U 5       5      nU H)  u  nnXU   U4==   UR                  U5      U-  -  ss'   M+     MW     M     U[	        U 5      -  n[        U	5       H  n[         R                  " XZ5      nSU-  nUU-  n[         R                  " UU-  SS9n[         R                  " UU-  SS9nU H  nUU==   S-  ss'   M     XU-
  U* -  -  n
[         R                  " [        UU-
  5      5      [         R                  " [        U
5      5      -  nUU:  d  M  U
s  $    U
$ )	aW	  
Calculate the update values for the classifier weights for
this iteration of IIS.  These update weights are the value of
``delta`` that solves the equation::

  ffreq_empirical[i]
         =
  SUM[fs,l] (classifier.prob_classify(fs).prob(l) *
             feature_vector(fs,l)[i] *
             exp(delta[i] * nf(feature_vector(fs,l))))

Where:
    - *(fs,l)* is a (featureset, label) tuple from ``train_toks``
    - *feature_vector(fs,l)* = ``encoding.encode(fs,l)``
    - *nf(vector)* = ``sum([val for (id,val) in vector])``

This method uses Newton's method to solve this equation for
*delta[i]*.  In particular, it starts with a guess of
``delta[i]`` = 1; and iteratively updates ``delta`` with:

| delta[i] -= (ffreq_empirical[i] - sum1[i])/(-sum2[i])

until convergence, where *sum1* and *sum2* are defined as:

|    sum1[i](delta) = SUM[fs,l] f[i](fs,l,delta)
|    sum2[i](delta) = SUM[fs,l] (f[i](fs,l,delta).nf(feature_vector(fs,l)))
|    f[i](fs,l,delta) = (classifier.prob_classify(fs).prob(l) .
|                        feature_vector(fs,l)[i] .
|                        exp(delta[i] . nf(feature_vector(fs,l))))

Note that *sum1* and *sum2* depend on ``delta``; so they need
to be re-computed each iteration.

The variables ``nfmap``, ``nfarray``, and ``nftranspose`` are
used to generate a dense encoding for *nf(ltext)*.  This
allows ``_deltas`` to calculate *sum1* and *sum2* using
matrices, which yields a significant performance improvement.

:param train_toks: The set of training tokens.
:type train_toks: list(tuple(dict, str))
:param classifier: The current classifier.
:type classifier: ClassifierI
:param ffreq_empirical: An array containing the empirical
    frequency for each feature.  The *i*\ th element of this
    array is the empirical frequency for feature *i*.
:type ffreq_empirical: sequence of float
:param unattested: An array that is 1 for features that are
    not attested in the training data; and 0 for features that
    are attested.  In other words, ``unattested[i]==0`` iff
    ``ffreq_empirical[i]==0``.
:type unattested: sequence of int
:param nfmap: A map that can be used to compress ``nf`` to a dense
    vector.
:type nfmap: dict(int -> int)
:param nfarray: An array that can be used to uncompress ``nf``
    from a dense vector.
:type nfarray: array(float)
:param nftranspose: The transpose of ``nfarray``
:type nftranspose: array(float)
g-q=i,  rR  c              3   *   #    U  H	  u  pUv   M     g 7fr#   rJ   r  s      r   rM   #calculate_deltas.<locals>.<genexpr>T  s     9.YbS.r%  rB   r   )axisr[   )r^  onesr   r`  r   r.   r$   r7   r&  r`   r|   outerrS   )r   rn  rm  ffreq_empiricalr  r  r  r   NEWTON_CONVERGE
MAX_NEWTONr  Ar  r9   distr:   r  r  rw  rangenumnf_deltaexp_nf_deltanf_exp_nf_deltasum1sum2rx   n_errors                              r   r~  r~    s   R OJZZ)3/F
 	SZ!23S9A 
'',__&E%__S8N9.99B)C)R- DIIe$4s$::  * ' ! ZA *%;;w/({%4yy)2yy1,15 CINI  	T)dU22 ))C$ 678599S[;QQ_$M# && Mr!   c           	         SnSnSU;   a  US   nSU;   a  US   nUc(  UR                  SS5      n[        R                  XUSS9nOUb  [        S5      e [        R
                  " S	S
9u  p[        U
S5       n[        XXUS9  SSS5        [        R                  " U	5        / nU/ SQ-  nU(       a  US/-  nU(       d  US/-  nU(       a	  SUS-  -  nOSnUSSU-  S/-  nUS:  a  US/-  nSU;   a  USSUS   -  /-  nSU;   a  USS[        US   5      -  /-  n[        US5      (       a  US/-  nUSU
/-  n[        U5      n [        R                  " U
5        [!        XR#                  5       U5      nU[$        R&                  " [$        R(                  5      -  n[+        UU5      $ ! , (       d  f       GN-= f! [        [        4 a  n[        SU-  5      UeSnAff = f! [         a  n[        S U
 S!U 35         SnANSnAff = f)"a{  
Train a new ``ConditionalExponentialClassifier``, using the given
training samples, using the external ``megam`` library.  This
``ConditionalExponentialClassifier`` will encode the model that
maximizes entropy from all the models that are empirically
consistent with ``train_toks``.

:see: ``train_maxent_classifier()`` for parameter descriptions.
:see: ``nltk.classify.megam``
Tr   r   Nr   r   )r$   r   z$Specify encoding or labels, not bothznltk-prefixw)r   r   z,Error while creating megam training file: %s)z-nobiasz-repeat10z	-explicitz-fvalsr4   rB   z-lambdaz%.2fz-tuner   z-quietr   z-maxirH   ll_deltaz-dppcostz-multilabel
multiclasszWarning: unable to delete z: )getr   r   r   tempfilemkstempopenr   oscloseOSErrorrS   rz   r   removere   r   r   r^  rc  er   )r   r   r   r$   r   r   r   r   r   fdtrainfile_name	trainfiler  r  inv_variancestdoutr   s                    r   r   r     sc    HIV*%f;'	  zz.!4.44Vt 5 
 
	?@@T%--W=.#&)ii ' 	
 G++GK= H: 1144	6L0'::GqyH:VGTF:$6677V 	FD3vj'9#::;;x  M?"n--G FB
		.!
 "&//*;XFG uzz%''""G Hg..c '&
 Z  TG!KLRSSTD  B*>*:"QC@AABsH   "F; 7F)F; G" )
F83F; ;GGG"
H,HHc                   $    \ rS rSr\S 5       rSrg)r   i  c                    UR                  SS5      nUR                  SS5      nUR                  SS 5      nUR                  SS 5      nUR                  SS5      nUR                  S	S5      nUR                  S
5      n	UR                  S5      n
U(       d  [        R                  XUS9n[        R                  " SSS9u  p[        R                  " SS9u  p[        US5      n[        XU5        UR                  5         / nUR                  S/5        UR                  SU/5        U(       a  UR                  SSUS-  -  /5        U	(       a  UR                  SSU	-  /5        U
(       a  UR                  SS[        U
5      -  /5        UR                  SU/5        UR                  SU/5        US:  a  UR                  S/5        OUR                  S/5        [        U5        [        U5       n[        U5      nS S S 5        [        R                  " U5        [        R                  " U5        W[        R                   " [        R"                  5      -  nU " UU5      $ ! , (       d  f       Nj= f)Nr   tao_lmvmr   r   r   r$   r   r   r   r   r   rQ  znltk-tadm-events-z.gz)r  suffixznltk-tadm-weights-r  r  z-monitorz-methodz-l2z%.6frB   z-max_itz%dz-fatolz
-events_inz-params_outz2>&1z-summary)r  r/  r   r  r  r   r
   r  extendrS   r   r  r	   r  r  r^  rc  r  )r   r   r   r   r   r   r$   sigmar   r   r  trainfile_fdr  weightfile_fdweightfile_namer  r  
weightfiler   s                      r   r   TadmMaxentClassifier.train  s   JJ{J7	

7A&::j$/Hd+

115zz.!4::j)::m, 5;; < H (0'7'7&u(
$ *2)9)9AU)V&%nc:	
i8
|$	9-.NNE6E1H#456NNIth78NNHfs8}&<=>n567819NNF8$NNJ<('/"j(4G # 			.!
		/" 	5::egg&& 8W%% #"s   0I  
I.rJ   N)r   r   r   r   r   r   r   rJ   r!   r   r   r     s    5& 5&r!   r   c                 R   SS K nSSKJn  U" 5       n[        U  S35       nUR                  " [        [        UR                  UR                  U5      5      5      5      nS S S 5        [        U  S35       nUR                  U5      nS S S 5        [        U  S35       nUR                  U5      nS S S 5        [        U  S35       nUR                  U5      nS S S 5        WWWW4$ ! , (       d  f       N= f! , (       d  f       Nu= f! , (       d  f       N^= f! , (       d  f       NG= f)Nr   )MaxentDecoder/weights.txt/mapping.tab/labels.txt/alwayson.tab)r^  nltk.tabdatar  r  r{  r{   mapfloat64txt2listtupkey2dict
tab2ivdict)	tab_dirr^  r  mdecr"  wgtmpglabaons	            r   load_maxent_paramsr    s    *?D		&	'1kk$s5==$--2BCDE 
( 
	&	'1q! 
( 
	%	&!mmA 
' 
	'	(Aooa  
) S# 
(	' 
(	' 
'	& 
)	(s/   ?C%5C6DD%
C36
D
D
D&c           
         SSK Jn  SSKJn  SSKJn  U" 5       nU" U5      (       d  U" U5        [        SU 35        [        U S3S5       n	U	R                  UR                  [        [        U R                  5       5      5       5        S S S 5        [        U S3S5       n	U	R                  UR                  U5       5        S S S 5        [        U S	3S5       n	U	R                  UR                  U5       5        S S S 5        [        U S
3S5       n	U	R                  UR                  U5       5        S S S 5        g ! , (       d  f       N= f! , (       d  f       N= f! , (       d  f       Nj= f! , (       d  f       g = f)Nr   )mkdir)isdir)MaxentEncoderzSaving Maxent parameters in r  r  r  r  r  )r  r  os.pathr  r  r  re   r  writelist2txtr  reprtolisttupdict2tab
ivdict2tab)
r  r  r  r  r  r  r  r  mencr"  s
             r   save_maxent_paramsr  2  s"   *?D>>g	(	
23		&	,	4==T3::<!89:< 
-		&	,	4##C()+ 
-		%s	+q	4==%&( 
,		'	-	4??3'(* 
.	- 
-	,	,	,	+	+	-	-s0   >D>!"E"E "E1>
E
E 
E.1
E?c                  z    SSK Jn   SSKJn  U " S5      n[	        U5      u  p4pV[        [        XTUS9U5      nU" US9$ )Nr   )find)ClassifierBasedPOSTaggerz.taggers/maxent_treebank_pos_tagger_tab/english)r   )rn  )	nltk.datar  nltk.tag.sequentialr  r  r   r   )r  r  r  r  r  r  r  mcs           r   maxent_pos_taggerr  I  sE    <CDG+G4Cc	#CDc
B $r22r!   c                  <    SSK Jn   U " [        R                  5      ng )Nr   )
names_demo)nltk.classify.utilr  r   r   )r  rn  s     r   demor  X  s    -,223Jr!   __main__)r   NN)r   NNr   )z/tmp)1r   r^  ImportErrorr  r  collectionsr   nltk.classify.apir   nltk.classify.megamr   r   r   nltk.classify.tadmr   r	   r
   r  r   r   r   r  r   nltk.probabilityr   	nltk.utilr   __docformat__r   rb  r   r   r   r  r/  r@  r   r]  rg  r   rz  r~  r   r   r  r  r  r  r   rJ   r!   r   <module>r     s:  ,Z	 
  # ) Q Q M M F F ' / !JA{ JA\ $4  F$ F$R,**@ ,*^G/"8 G/T:D- :Dz5/%@ 5/pa/!7 a/T 04_D
& 04Yx38{P KLT/x7&+ 7&~,+.	34 zF M1  		s   C" "C+*C+