
    /h                         S r SSKrSSKJr  SSKJr  SSKJr  SSKJ	r	  \S 5       r
\
R                  \5      S 5       r\
R                  \5      S	 5       r " S
 S5      rg)zLanguage Model Vocabulary    N)Counter)Iterable)singledispatch)chainc                 0    [        S[        U 5       35      e)Nz/Unsupported type for looking up in vocabulary: )	TypeErrortypewordsvocabs     J/var/www/auris/envauris/lib/python3.13/site-packages/nltk/lm/vocabulary.py_dispatched_lookupr      s    
Ed5k]S
TT    c                 .   ^ [        U4S jU  5       5      $ )z[Look up a sequence of words in the vocabulary.

Returns an iterator over looked up words.

c              3   <   >#    U  H  n[        UT5      v   M     g 7fNr   ).0wr   s     r   	<genexpr>_.<locals>.<genexpr>   s     =u!#Au--us   )tupler
   s    `r   _r      s     =u===r   c                 (    X;   a  U $ UR                   $ )z$Looks up one word in the vocabulary.)	unk_label)wordr   s     r   _string_lookupr      s     =45eoo5r   c                   b    \ rS rSrSrSS jr\S 5       rS rS r	S r
S	 rS
 rS rS rS rSrg)
Vocabulary%   a	  Stores language model vocabulary.

Satisfies two common language modeling requirements for a vocabulary:

- When checking membership and calculating its size, filters items
  by comparing their counts to a cutoff value.
- Adds a special "unknown" token which unseen words are mapped to.

>>> words = ['a', 'c', '-', 'd', 'c', 'a', 'b', 'r', 'a', 'c', 'd']
>>> from nltk.lm import Vocabulary
>>> vocab = Vocabulary(words, unk_cutoff=2)

Tokens with counts greater than or equal to the cutoff value will
be considered part of the vocabulary.

>>> vocab['c']
3
>>> 'c' in vocab
True
>>> vocab['d']
2
>>> 'd' in vocab
True

Tokens with frequency counts less than the cutoff value will be considered not
part of the vocabulary even though their entries in the count dictionary are
preserved.

>>> vocab['b']
1
>>> 'b' in vocab
False
>>> vocab['aliens']
0
>>> 'aliens' in vocab
False

Keeping the count entries for seen words allows us to change the cutoff value
without having to recalculate the counts.

>>> vocab2 = Vocabulary(vocab.counts, unk_cutoff=1)
>>> "b" in vocab2
True

The cutoff value influences not only membership checking but also the result of
getting the size of the vocabulary using the built-in `len`.
Note that while the number of keys in the vocabulary's counter stays the same,
the items in the vocabulary differ depending on the cutoff.
We use `sorted` to demonstrate because it keeps the order consistent.

>>> sorted(vocab2.counts)
['-', 'a', 'b', 'c', 'd', 'r']
>>> sorted(vocab2)
['-', '<UNK>', 'a', 'b', 'c', 'd', 'r']
>>> sorted(vocab.counts)
['-', 'a', 'b', 'c', 'd', 'r']
>>> sorted(vocab)
['<UNK>', 'a', 'c', 'd']

In addition to items it gets populated with, the vocabulary stores a special
token that stands in for so-called "unknown" items. By default it's "<UNK>".

>>> "<UNK>" in vocab
True

We can look up words in a vocabulary using its `lookup` method.
"Unseen" words (with counts less than cutoff) are looked up as the unknown label.
If given one word (a string) as an input, this method will return a string.

>>> vocab.lookup("a")
'a'
>>> vocab.lookup("aliens")
'<UNK>'

If given a sequence, it will return an tuple of the looked up words.

>>> vocab.lookup(["p", 'a', 'r', 'd', 'b', 'c'])
('<UNK>', 'a', '<UNK>', 'd', '<UNK>', 'c')

It's possible to update the counts after the vocabulary has been created.
In general, the interface is the same as that of `collections.Counter`.

>>> vocab['b']
1
>>> vocab.update(["b", "b", "c"])
>>> vocab['b']
3
Nc                     X0l         US:  a  [        SU 35      eX l        [        5       U l        U R                  Ub  U5        gS5        g)a  Create a new Vocabulary.

:param counts: Optional iterable or `collections.Counter` instance to
               pre-seed the Vocabulary. In case it is iterable, counts
               are calculated.
:param int unk_cutoff: Words that occur less frequently than this value
                       are not considered part of the vocabulary.
:param unk_label: Label for marking words not part of vocabulary.

   z)Cutoff value cannot be less than 1. Got: N )r   
ValueError_cutoffr   countsupdate)selfr&   
unk_cutoffr   s       r   __init__Vocabulary.__init__   sG     #>HUVV!if0F9b9r   c                     U R                   $ )zYCutoff value.

Items with count below this value are not considered part of vocabulary.

)r%   r(   s    r   cutoffVocabulary.cutoff   s     ||r   c                 j    U R                   R                  " U0 UD6  [        S U  5       5      U l        g)zGUpdate vocabulary counts.

Wraps `collections.Counter.update` method.

c              3   &   #    U  H  nS v   M	     g7f)r"   N )r   r   s     r   r   $Vocabulary.update.<locals>.<genexpr>   s     (4a4s   N)r&   r'   sum_len)r(   counter_argscounter_kwargss      r   r'   Vocabulary.update   s-     	L;N;(4((	r   c                     [        X5      $ )a  Look up one or more words in the vocabulary.

If passed one word as a string will return that word or `self.unk_label`.
Otherwise will assume it was passed a sequence of words, will try to look
each of them up and return an iterator over the looked up words.

:param words: Word(s) to look up.
:type words: Iterable(str) or str
:rtype: generator(str) or str
:raises: TypeError for types other than strings or iterables

>>> from nltk.lm import Vocabulary
>>> vocab = Vocabulary(["a", "b", "c", "a", "b"], unk_cutoff=2)
>>> vocab.lookup("a")
'a'
>>> vocab.lookup("aliens")
'<UNK>'
>>> vocab.lookup(["a", "b", "c", ["x", "b"]])
('a', 'b', '<UNK>', ('<UNK>', 'b'))

r   )r(   r   s     r   lookupVocabulary.lookup   s    , "%..r   c                 V    XR                   :X  a  U R                  $ U R                  U   $ r   )r   r%   r&   r(   items     r   __getitem__Vocabulary.__getitem__   s#    #~~5t||L4;;t;LLr   c                 $    X   U R                   :  $ )zHOnly consider items with counts GE to cutoff as being in the
vocabulary.)r.   r=   s     r   __contains__Vocabulary.__contains__   s     zT[[((r   c                    ^  [        U 4S jT R                   5       T R                  (       a  T R                  /5      $ / 5      $ )zCBuilding on membership check define how to iterate over
vocabulary.c              3   6   >#    U  H  oT;   d  M
  Uv   M     g 7fr   r2   )r   r>   r(   s     r   r   &Vocabulary.__iter__.<locals>.<genexpr>   s     :kdT\TTks   		)r   r&   r   r-   s   `r   __iter__Vocabulary.__iter__   s;     :dkk: $T^^
 	
13
 	
r   c                     U R                   $ )z1Computing size of vocabulary reflects the cutoff.)r5   r-   s    r   __len__Vocabulary.__len__   s    yyr   c                     U R                   UR                   :H  =(       a9    U R                  UR                  :H  =(       a    U R                  UR                  :H  $ r   )r   r.   r&   )r(   others     r   __eq__Vocabulary.__eq__   sA    NNeoo- ,u||+,u||+	
r   c                     SR                  U R                  R                  U R                  U R                  [        U 5      5      $ )Nz/<{} with cutoff={} unk_label='{}' and {} items>)format	__class____name__r.   r   lenr-   s    r   __str__Vocabulary.__str__   s4    @GGNN##T[[$..#d)
 	
r   )r%   r5   r&   r   )Nr"   z<UNK>)rS   
__module____qualname____firstlineno____doc__r*   propertyr.   r'   r:   r?   rB   rG   rJ   rN   rU   __static_attributes__r2   r   r   r   r   %   sK    Wr:&  )/0M)



r   r   )rZ   syscollectionsr   collections.abcr   	functoolsr   	itertoolsr   r   registerr   strr   r   r2   r   r   <module>rd      sy      
  $ $  U U X&> '> S!6 "6
u
 u
r   