o
    Zh                     @   s   d dl mZ d dlmZ d dlZd dlZddlmZm	Z	 ddl
mZ ddlmZmZ e	eZeed	d	d
G dd deZdS )    )UserDict)UnionN   )add_end_docstringslogging   )ffmpeg_read)Pipelinebuild_pipeline_init_argsT)Zhas_feature_extractorZhas_tokenizerc                       s`   e Zd ZdZ fddZdeejee	f f fddZ
dd ZdddZdd Zdd Z  ZS )#ZeroShotAudioClassificationPipelinea  
    Zero shot audio classification pipeline using `ClapModel`. This pipeline predicts the class of an audio when you
    provide an audio and a set of `candidate_labels`.

    <Tip warning={true}>

    The default `hypothesis_template` is : `"This is a sound of {}."`. Make sure you update it for your usage.

    </Tip>

    Example:
    ```python
    >>> from transformers import pipeline
    >>> from datasets import load_dataset

    >>> dataset = load_dataset("ashraq/esc50")
    >>> audio = next(iter(dataset["train"]["audio"]))["array"]
    >>> classifier = pipeline(task="zero-shot-audio-classification", model="laion/clap-htsat-unfused")
    >>> classifier(audio, candidate_labels=["Sound of a dog", "Sound of vaccum cleaner"])
    [{'score': 0.9996, 'label': 'Sound of a dog'}, {'score': 0.0004, 'label': 'Sound of vaccum cleaner'}]
    ```


    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial) This audio
    classification pipeline can currently be loaded from [`pipeline`] using the following task identifier:
    `"zero-shot-audio-classification"`. See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?filter=zero-shot-audio-classification).
    c                    s2   t  jdi | | jdkrtd| j dd S )NptzThe z is only available in PyTorch. )super__init__	framework
ValueError	__class__)selfkwargsr   r   d/var/www/auris/lib/python3.10/site-packages/transformers/pipelines/zero_shot_audio_classification.pyr   ?   s   
z,ZeroShotAudioClassificationPipeline.__init__audiosc                    s   t  j|fi |S )a  
        Assign labels to the audio(s) passed as inputs.

        Args:
            audios (`str`, `List[str]`, `np.array` or `List[np.array]`):
                The pipeline handles three types of inputs:
                - A string containing a http link pointing to an audio
                - A string containing a local path to an audio
                - An audio loaded in numpy
            candidate_labels (`List[str]`):
                The candidate labels for this audio. They will be formatted using *hypothesis_template*.
            hypothesis_template (`str`, *optional*, defaults to `"This is a sound of {}"`):
                The format used in conjunction with *candidate_labels* to attempt the audio classification by
                replacing the placeholder with the candidate_labels. Pass "{}" if *candidate_labels* are
                already formatted.
        Return:
            A list of dictionaries containing one entry per proposed label. Each dictionary contains the
            following keys:
            - **label** (`str`) -- One of the suggested *candidate_labels*.
            - **score** (`float`) -- The score attributed by the model to that label. It is a value between
                0 and 1, computed as the `softmax` of `logits_per_audio`.
        )r   __call__)r   r   r   r   r   r   r   F   s   z,ZeroShotAudioClassificationPipeline.__call__c                 K   s6   i }d|v r|d |d< d|v r|d |d< |i i fS )Ncandidate_labelshypothesis_templater   )r   r   Zpreprocess_paramsr   r   r   _sanitize_parameters_   s   
z8ZeroShotAudioClassificationPipeline._sanitize_parametersNThis is a sound of {}.c                    s  t |tr/|ds|drt|j}nt|d}| }W d    n1 s*w   Y  t |tr;t	|| j
j}t |tjsEtdt|jdkrPtd| j
|g| j
jdd}| jdkrf|| j}||d	<  fd
d|D }| j|| jdd}|g|d< |S )Nzhttp://zhttps://rbz"We expect a numpy ndarray as inputr   zNWe expect a single channel audio input for ZeroShotAudioClassificationPipeliner   )sampling_ratereturn_tensorsr   c                    s   g | ]}  |qS r   )format).0xr   r   r   
<listcomp>   s    zBZeroShotAudioClassificationPipeline.preprocess.<locals>.<listcomp>T)r   paddingtext_inputs)
isinstancestr
startswithrequestsgetcontentopenreadbytesr   Zfeature_extractorr   npndarray	TypeErrorlenshaper   r   toZtorch_dtypeZ	tokenizer)r   Zaudior   r   fZinputs	sequencesr&   r   r#   r   
preprocessh   s,   




z.ZeroShotAudioClassificationPipeline.preprocessc                 C   s\   | d}| d}t|d tr|d }n|d d }| jdi ||}||jd}|S )Nr   r&   r   )r   logitsr   )popr'   r   modelZlogits_per_audio)r   Zmodel_inputsr   r&   Zoutputsmodel_outputsr   r   r   _forward   s   


z,ZeroShotAudioClassificationPipeline._forwardc                 C   sb   | d}|d d }| jdkr|jdd}| }ntddd tt||d	d
 dD }|S )Nr   r9   r   r   )dimz`tf` framework not supported.c                 S   s   g | ]	\}}||d qS ))scorelabelr   )r!   r?   Zcandidate_labelr   r   r   r$      s    zCZeroShotAudioClassificationPipeline.postprocess.<locals>.<listcomp>c                 S   s
   | d  S )Nr   r   )r"   r   r   r   <lambda>   s   
 zAZeroShotAudioClassificationPipeline.postprocess.<locals>.<lambda>)key)r:   r   Zsoftmaxtolistr   sortedzip)r   r<   r   r9   ZprobsZscoresresultr   r   r   postprocess   s   


z/ZeroShotAudioClassificationPipeline.postprocess)Nr   )__name__
__module____qualname____doc__r   r   r0   r1   r/   r(   r   r   r8   r=   rG   __classcell__r   r   r   r   r       s    
	r   )collectionsr   typingr   numpyr0   r*   utilsr   r   Zaudio_classificationr   baser	   r
   Z
get_loggerrH   loggerr   r   r   r   r   <module>   s   
