o
    rZh.                     @   st   d Z ddlZddlT ddlT edZedZedZedZG dd	 d	Z	G d
d dZ
G dd deZdS )a	  
CorpusReader for reviews corpora (syntax based on Customer Review Corpus).

Customer Review Corpus information
==================================

Annotated by: Minqing Hu and Bing Liu, 2004.
    Department of Computer Science
    University of Illinois at Chicago

Contact: Bing Liu, liub@cs.uic.edu
        https://www.cs.uic.edu/~liub

Distributed with permission.

The "product_reviews_1" and "product_reviews_2" datasets respectively contain
annotated customer reviews of 5 and 9 products from amazon.com.

Related papers:

- Minqing Hu and Bing Liu. "Mining and summarizing customer reviews".
    Proceedings of the ACM SIGKDD International Conference on Knowledge
    Discovery & Data Mining (KDD-04), 2004.

- Minqing Hu and Bing Liu. "Mining Opinion Features in Customer Reviews".
    Proceedings of Nineteeth National Conference on Artificial Intelligence
    (AAAI-2004), 2004.

- Xiaowen Ding, Bing Liu and Philip S. Yu. "A Holistic Lexicon-Based Appraoch to
    Opinion Mining." Proceedings of First ACM International Conference on Web
    Search and Data Mining (WSDM-2008), Feb 11-12, 2008, Stanford University,
    Stanford, California, USA.

Symbols used in the annotated reviews:

    :[t]: the title of the review: Each [t] tag starts a review.
    :xxxx[+|-n]: xxxx is a product feature.
    :[+n]: Positive opinion, n is the opinion strength: 3 strongest, and 1 weakest.
           Note that the strength is quite subjective.
           You may want ignore it, but only considering + and -
    :[-n]: Negative opinion
    :##:   start of each sentence. Each line is a sentence.
    :[u]:  feature not appeared in the sentence.
    :[p]:  feature not appeared in the sentence. Pronoun resolution is needed.
    :[s]:  suggestion or recommendation.
    :[cc]: comparison with a competing product from a different brand.
    :[cs]: comparison with a competing product from the same brand.

Note: Some of the files (e.g. "ipod.txt", "Canon PowerShot SD500.txt") do not
    provide separation between different reviews. This is due to the fact that
    the dataset was specifically designed for aspect/feature-based sentiment
    analysis, for which sentence-level annotation is sufficient. For document-
    level classification and analysis, this peculiarity should be taken into
    consideration.
    N)*z^\[t\](.*)$z%((?:(?:\w+\s)+)?\w+)\[((?:\+|\-)\d)\]z\[(?!t)(p|u|s|cc|cs)\]z##(.*)$c                   @   s:   e Zd ZdZdddZdd Zdd Zd	d
 Zdd ZdS )Reviewz>
    A Review is the main block of a ReviewsCorpusReader.
    Nc                 C   s"   || _ |du rg | _dS || _dS )z
        :param title: the title of the review.
        :param review_lines: the list of the ReviewLines that belong to the Review.
        N)titlereview_lines)selfr   r    r   I/var/www/auris/lib/python3.10/site-packages/nltk/corpus/reader/reviews.py__init__R   s   

zReview.__init__c                 C   s   t |tsJ | j| dS )z
        Add a line (ReviewLine) to the review.

        :param review_line: a ReviewLine instance that belongs to the Review.
        N)
isinstance
ReviewLiner   append)r   review_liner   r   r   add_line]   s   zReview.add_linec                 C   s    g }| j D ]}||j q|S )a  
        Return a list of features in the review. Each feature is a tuple made of
        the specific item feature and the opinion strength about that feature.

        :return: all features of the review as a list of tuples (feat, score).
        :rtype: list(tuple)
        )r   extendfeatures)r   r   r   r   r   r   r   f   s   
zReview.featuresc                 C   s   dd | j D S )z
        Return all tokenized sentences in the review.

        :return: all sentences of the review as lists of tokens.
        :rtype: list(list(str))
        c                 S   s   g | ]}|j qS r   )sent).0r   r   r   r   
<listcomp>z   s    z Review.sents.<locals>.<listcomp>)r   r   r   r   r   sentss   s   zReview.sentsc                 C   s   d | j| jS )Nz#Review(title="{}", review_lines={}))formatr   r   r   r   r   r   __repr__|   s   zReview.__repr__NN)	__name__
__module____qualname____doc__r	   r   r   r   r   r   r   r   r   r   M   s    
		r   c                   @   s"   e Zd ZdZdddZdd ZdS )r   z
    A ReviewLine represents a sentence of the review, together with (optional)
    annotations of its features and notes about the reviewed item.
    Nc                 C   s8   || _ |d u rg | _n|| _|d u rg | _d S || _d S Nr   r   notes)r   r   r   r   r   r   r   r	      s   

zReviewLine.__init__c                 C   s   d | j| j| jS )Nz*ReviewLine(features={}, notes={}, sent={}))r   r   r   r   r   r   r   r   r      s   zReviewLine.__repr__r   )r   r   r   r   r	   r   r   r   r   r   r      s    
r   c                   @   sl   e Zd ZdZeZe dfddZdddZddd	Z	dd
dZ
dddZdd Zdd Zdd Zdd ZdS )ReviewsCorpusReadera  
    Reader for the Customer Review Data dataset by Hu, Liu (2004).
    Note: we are not applying any sentence tokenization at the moment, just word
    tokenization.

        >>> from nltk.corpus import product_reviews_1
        >>> camera_reviews = product_reviews_1.reviews('Canon_G3.txt')
        >>> review = camera_reviews[0]
        >>> review.sents()[0] # doctest: +NORMALIZE_WHITESPACE
        ['i', 'recently', 'purchased', 'the', 'canon', 'powershot', 'g3', 'and', 'am',
        'extremely', 'satisfied', 'with', 'the', 'purchase', '.']
        >>> review.features() # doctest: +NORMALIZE_WHITESPACE
        [('canon powershot g3', '+3'), ('use', '+2'), ('picture', '+2'),
        ('picture quality', '+1'), ('picture quality', '+1'), ('camera', '+2'),
        ('use', '+2'), ('feature', '+1'), ('picture quality', '+3'), ('use', '+1'),
        ('option', '+1')]

    We can also reach the same information directly from the stream:

        >>> product_reviews_1.features('Canon_G3.txt')
        [('canon powershot g3', '+3'), ('use', '+2'), ...]

    We can compute stats for specific product features:

        >>> n_reviews = len([(feat,score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> tot = sum([int(score) for (feat,score) in product_reviews_1.features('Canon_G3.txt') if feat=='picture'])
        >>> mean = tot / n_reviews
        >>> print(n_reviews, tot, mean)
        15 24 1.6
    utf8c                 C   s    t | ||| || _d| _dS )ad  
        :param root: The root directory for the corpus.
        :param fileids: a list or regexp specifying the fileids in the corpus.
        :param word_tokenizer: a tokenizer for breaking sentences or paragraphs
            into words. Default: `WordPunctTokenizer`
        :param encoding: the encoding that should be used to read the corpus.
        z
README.txtN)CorpusReaderr	   _word_tokenizerZ_readme)r   rootfileidsZword_tokenizerencodingr   r   r   r	      s   
zReviewsCorpusReader.__init__Nc                    s>   |du r j }nt|tr|g}t fdd |dD S )au  
        Return a list of features. Each feature is a tuple made of the specific
        item feature and the opinion strength about that feature.

        :param fileids: a list or regexp specifying the ids of the files whose
            features have to be returned.
        :return: all features for the item(s) in the given file(s).
        :rtype: list(tuple)
        Nc                    "   g | ]\}} j | j|d qS )r&   )
CorpusView_read_featuresr   fileidencr   r   r   r          z0ReviewsCorpusReader.features.<locals>.<listcomp>T)_fileidsr
   strconcatabspathsr   r%   r   r   r   r      s   



zReviewsCorpusReader.featuresc                    s,   |du r j }t fdd |dD S )aS  
        Return all the reviews as a list of Review objects. If `fileids` is
        specified, return all the reviews from each of the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            reviews have to be returned.
        :return: the given file(s) as a list of reviews.
        Nc                    r'   r(   )r)   _read_review_blockr+   r   r   r   r      r.   z/ReviewsCorpusReader.reviews.<locals>.<listcomp>T)r/   r1   r2   r3   r   r   r   reviews   s   	

zReviewsCorpusReader.reviewsc                        t  fdd |ddD S )aY  
        Return all sentences in the corpus or in the specified files.

        :param fileids: a list or regexp specifying the ids of the files whose
            sentences have to be returned.
        :return: the given file(s) as a list of sentences, each encoded as a
            list of word strings.
        :rtype: list(list(str))
        c                    $   g | ]\}}} j | j|d qS r(   )r)   _read_sent_blockr   pathr-   r,   r   r   r   r          z-ReviewsCorpusReader.sents.<locals>.<listcomp>Tr1   r2   r3   r   r   r   r      
   

zReviewsCorpusReader.sentsc                    r6   )aK  
        Return all words and punctuation symbols in the corpus or in the specified
        files.

        :param fileids: a list or regexp specifying the ids of the files whose
            words have to be returned.
        :return: the given file(s) as a list of words and punctuation symbols.
        :rtype: list(str)
        c                    r7   r(   )r)   _read_word_blockr9   r   r   r   r     r;   z-ReviewsCorpusReader.words.<locals>.<listcomp>Tr<   r3   r   r   r   words  r=   zReviewsCorpusReader.wordsc                 C   s<   g }t dD ]}| }|s|  S |tt| q|S )N   )rangereadliner   refindallFEATURES)r   streamr   iliner   r   r   r*     s   z"ReviewsCorpusReader._read_featuresc           
      C   s   	 |  }|s	g S tt|}|rt|d d}nq	 | }|  }|s+|gS tt|r9|| |gS t	t
|}t	t|}t	t|}|rU| j|d }t|||d}	||	 q)NT   )r   r   r   )rB   rC   matchTITLEr   groupstriptellseekrD   rE   NOTESSENTr#   tokenizer   r   )
r   rF   rH   Ztitle_matchreviewoldposZfeatsr   r   r   r   r   r   r4     s6   

z&ReviewsCorpusReader._read_review_blockc                 C   s0   g }|  |D ]}|dd | D  q|S )Nc                 S   s   g | ]}|qS r   r   )r   r   r   r   r   r   A  s    z8ReviewsCorpusReader._read_sent_block.<locals>.<listcomp>)r4   r   r   )r   rF   r   rS   r   r   r   r8   >  s   z$ReviewsCorpusReader._read_sent_blockc                 C   sD   g }t dD ]}| }tt|}|r|| j|d  q|S )Nr@   r   )rA   rB   rC   rD   rQ   r   r#   rR   )r   rF   r?   rG   rH   r   r   r   r   r>   D  s   z$ReviewsCorpusReader._read_word_blockr   )r   r   r   r   ZStreamBackedCorpusViewr)   ZWordPunctTokenizerr	   r   r5   r   r?   r*   r4   r8   r>   r   r   r   r   r       s    




	!r    )r   rC   Znltk.corpus.reader.apiZnltk.tokenizecompilerK   rE   rP   rQ   r   r   r"   r    r   r   r   r   <module>   s   8


5