o
    rZhx                     @   sT   d Z ddlZddlZddlmZ ddlmZmZmZ ddl	m
Z
 G dd deZdS )z{
A reader for corpora that consist of Tweets. It is assumed that the Tweets
have been serialised into line-delimited JSON.
    N)CorpusReader)StreamBackedCorpusViewZipFilePathPointerconcat)TweetTokenizerc                   @   sN   e Zd ZdZeZ	 de dfddZdddZddd	Z	dd
dZ
dd ZdS )TwitterCorpusReadera7  
    Reader for corpora that consist of Tweets represented as a list of line-delimited JSON.

    Individual Tweets can be tokenized using the default tokenizer, or by a
    custom tokenizer specified as a parameter to the constructor.

    Construct a new Tweet corpus reader for a set of documents
    located at the given root directory.

    If you made your own tweet collection in a directory called
    `twitter-files`, then you can initialise the reader as::

        from nltk.corpus import TwitterCorpusReader
        reader = TwitterCorpusReader(root='/path/to/twitter-files', '.*\.json')

    However, the recommended approach is to set the relevant directory as the
    value of the environmental variable `TWITTER`, and then invoke the reader
    as follows::

       root = os.environ['TWITTER']
       reader = TwitterCorpusReader(root, '.*\.json')

    If you want to work directly with the raw Tweets, the `json` library can
    be used::

       import json
       for tweet in reader.docs():
           print(json.dumps(tweet, indent=1, sort_keys=True))

    Nutf8c                 C   sZ   t | ||| | | jD ]}t|trqtj|dkr&t	d| dq	 || _
dS )a  
        :param root: The root directory for this corpus.
        :param fileids: A list or regexp specifying the fileids in this corpus.
        :param word_tokenizer: Tokenizer for breaking the text of Tweets into
            smaller units, including but not limited to words.
        r   zFile z	 is emptyN)r   __init__abspathsZ_fileids
isinstancer   ospathgetsize
ValueError_word_tokenizer)selfrootfileidsZword_tokenizerencodingr    r   I/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/twitter.pyr	   :   s   	

zTwitterCorpusReader.__init__c                    s    t  fdd |ddD S )a(  
        Returns the full Tweet objects, as specified by `Twitter
        documentation on Tweets
        <https://dev.twitter.com/docs/platform-objects/tweets>`_

        :return: the given file(s) as a list of dictionaries deserialised
            from JSON.
        :rtype: list(dict)
        c                    s$   g | ]\}}} j | j|d qS ))r   )
CorpusView_read_tweets).0r   encZfileidr   r   r   
<listcomp>Y   s    z,TwitterCorpusReader.docs.<locals>.<listcomp>T)r   r
   )r   r   r   r   r   docsN   s
   

zTwitterCorpusReader.docsc              	   C   sZ   |  |}g }|D ]!}z|d }t|tr|| j}|| W q	 ty*   Y q	w |S )z
        Returns only the text content of Tweets in the file(s)

        :return: the given file(s) as a list of Tweets.
        :rtype: list(str)
        text)r   r   bytesdecoder   appendKeyError)r   r   Z
fulltweetstweetsZjsonor   r   r   r   strings_   s   

zTwitterCorpusReader.stringsc                    s"   |  |}| j  fdd|D S )z
        :return: the given file(s) as a list of the text content of Tweets as
            as a list of words, screenanames, hashtags, URLs and punctuation symbols.

        :rtype: list(list(str))
        c                    s   g | ]}  |qS r   )tokenize)r   tZ	tokenizerr   r   r   {   s    z1TwitterCorpusReader.tokenized.<locals>.<listcomp>)r$   r   )r   r   r#   r   r'   r   	tokenizedr   s   
zTwitterCorpusReader.tokenizedc                 C   s>   g }t dD ]}| }|s|  S t|}|| q|S )zS
        Assumes that each line in ``stream`` is a JSON-serialised object.
        
   )rangereadlinejsonloadsr!   )r   streamr#   ilineZtweetr   r   r   r   }   s   
z TwitterCorpusReader._read_tweets)N)__name__
__module____qualname____doc__r   r   r   r	   r   r$   r(   r   r   r   r   r   r      s    



r   )r4   r,   r   Znltk.corpus.reader.apir   Znltk.corpus.reader.utilr   r   r   Znltk.tokenizer   r   r   r   r   r   <module>   s   