
    2h؉                       % S SK Jr  S SKrS SKrS SKJrJrJrJrJ	r	  S SK
Jr  SSKJr   " S S\	5      r\" S	S
S9rS*S jrS+S jrSS/4S,S jjrS-S.S jjrSS/4S,S jjrS*S jrS*S jrS*S jrS*S jrS*S jrS*S jrS/S jrSS/4     S0S jjrS*S jrS1S jrS2S jr         S3S jr!S4S jr"S5S  jr#S6S! jr$S7S" jr%S8S# jr&S9S:S$ jjr'S%r(S&\)S''   S;S( jr*              S<S) jr+g)=    )annotationsN)AnyTypeVarCallableOptional
NamedTuple)	TypeAlias   )pandasc                  t    \ rS rSr% S\S'   SrS\S'   SrS\S'   SrS\S	'   SrS\S
'   Sr	S\S'   Sr
S\S'   Srg)Remediation   strnameNzOptional[str]immediate_msgnecessary_msgzOptional[Callable[[Any], Any]]necessary_fnoptional_msgoptional_fn	error_msg )__name__
__module____qualname____firstlineno____annotations__r   r   r   r   r   r   __static_attributes__r       N/var/www/auris/envauris/lib/python3.13/site-packages/openai/lib/_validators.pyr   r      sE    
I#'M='#'M='37L07"&L-&26K/6#I}#r   r   OptionalDataFrameTzOptional[pd.DataFrame])boundc                b    Sn[        U 5      U:  a  SOSnS[        U 5       SU 3n[        SUS9$ )z
This validator will only print out the number of examples and recommend to the user to increase the number of examples if less than 100.
d    z. In general, we recommend having at least a few hundred examples. We've found that performance tends to linearly increase for every doubling of the number of examplesz
- Your file contains z prompt-completion pairsnum_examplesr   r   )lenr   )dfMIN_EXAMPLESoptional_suggestionr   s       r   num_examples_validatorr+      sO     L r7l" 	 w 
 .c"gY6NObNcdMN-HHr   c                  ^^ SS jmSnSnSnSnTU R                   ;  aV  TU R                    Vs/ s H  n[        U5      R                  5       PM     sn;   a  SUU4S jjnUnST S3nST S3nOST S3n[        S	UUUUS
9$ s  snf )zS
This validator will ensure that the necessary column is present in the dataframe.
c                    U R                    Vs/ s H$  n[        U5      R                  5       U:X  d  M"  UPM&     nnU R                  US   UR                  5       0SS9  U $ s  snf )Nr   T)columnsinplace)r.   r   lowerrename)r(   columnccolss       r   lower_case_column5necessary_column_validator.<locals>.lower_case_column,   sU    ::B:aQ6)A:B
		47FLLN3T	B	 Cs
   !A!A!Nc                   > T" U T5      $ Nr   )r(   r5   necessary_columns    r   lower_case_column_creator=necessary_column_validator.<locals>.lower_case_column_creator9   s    (-=>>r   z
- The `z ` column/key should be lowercasezLower case column name to ``z^` column/key is missing. Please make sure you name your columns/keys appropriately, then retryr9   )r   r   r   r   r   )r(   pd.DataFramer2   r   returnr=   )r(   r=   r>   r=   )r.   r   r0   r   )	r(   r9   r   r   r   r   r3   r:   r5   s	    `      @r   necessary_column_validatorr?   '   s    

 MLMIrzz)

C
1A
CC? ? 5L'(8'99YZM9:J9K1MM,-  .L  MI##!   Ds   #Bprompt
completionc                b  ^ / nSnSnSn[        U R                  5      S:  ax  U R                   Vs/ s H  ofT;  d  M
  UPM     nnSnU H9  nU Vs/ s H  ohU;   d  M
  UPM     n	n[        U	5      S:  d  M-  USU SU S3-  nM;     SU U 3nS	U 3nSU4S
 jjn[        SUUUS9$ s  snf s  snf )zC
This validator will remove additional columns from the dataframe.
Nr
   r$   r   z9
  WARNING: Some of the additional columns/keys contain `z<` in their name. These will be ignored, and the column/key `z`` will be used instead. This could also result from a duplicate column/key in the provided file.zh
- The input file should contain exactly two columns/keys per row. Additional columns/keys present are: z Remove additional columns/keys: c                   > U T   $ r8   r   xfieldss    r   r   1additional_column_validator.<locals>.necessary_fn^   s    V9r   additional_columnr   r   r   r   rE   r   r>   r   )r'   r.   r   )
r(   rF   additional_columnsr   r   r   r3   warn_messageacdupss
    `        r   additional_column_validatorrO   K   s    MML
2::)+GAaG$B1=1!1WA1D=4y1}"\]_\`  a]  ^`  ]a  aA  !B  B % D  EW  DX  Ye  Xf  g:;M:NO	  ##!	  H >s   	B'B'	B,B,c                  ^ SnSnSnU T   R                  S 5      R                  5       (       d&  U T   R                  5       R                  5       (       ai  U T   S:H  U T   R                  5       -  nU R                  5       R                  U   R                  5       nST SU 3nSU4S jjnS[        U5       ST S	3n[        S
T 3UUUS9$ )z9
This validator will ensure that no completion is empty.
Nc                    U S:H  $ )Nr$   r   rE   s    r   <lambda>+non_empty_field_validator.<locals>.<lambda>q   s    br   r$   z
- `z?` column/key should not contain empty strings. These are rows: c                4   > X T   S:g     R                  T/S9$ )Nr$   subset)dropna)rE   fields    r   r   /non_empty_field_validator.<locals>.necessary_fnv   s$    uX^$++E7+;;r   Remove z rows with empty sempty_rI   rJ   )applyanyisnullreset_indexindextolistr'   r   )r(   rY   r   r   r   
empty_rowsempty_indexess    `     r   non_empty_field_validatorrf   i   s     MLM	%y()--//2e93C3C3E3I3I3K3Ki2o"U)*:*:*<=
(..z:AACw&efsetu	< "#m"4!55FugQOeW##!	 r   c                .  ^ U R                  TS9nU R                  5       R                  U   R                  5       nSnSnSn[	        U5      S:  a:  S[	        U5       SSR                  T5       SU 3nS[	        U5       S	3nSU4S
 jjn[        SUUUS9$ )zQ
This validator will suggest to the user to remove duplicate rows if they exist.
rV   Nr   
- There are z duplicated -z sets. These are rows: r[   z duplicate rowsc                "   > U R                  TS9$ )NrV   )drop_duplicatesrD   s    r   r   .duplicated_rows_validator.<locals>.optional_fn   s    $$F$33r   duplicated_rowsr   r   r   r   rJ   )
duplicatedra   rb   rc   r'   joinr   )r(   rF   rm   duplicated_indexesr   r   r   s    `     r   duplicated_rows_validatorrr      s     mm6m2O)//@GGIMLK
"(-?)@(AchhW]N^M__v  xJ  wK  L %7!8 9I	4 #!	 r   c                   ^^ SnSnSn[        U 5      nUS:w  aF  SS jmT" U 5      m[        T5      S:  a*  S[        T5       ST S3nS[        T5       S	3nSUU4S
 jjn[        SUUUS9$ )zO
This validator will suggest to the user to remove examples that are too long.
Nopen-ended generationc                z    U R                  S SS9nU R                  5       R                  U   R                  5       $ )Nc                ^    [        U R                  5      [        U R                  5      -   S:  $ )Ni'  )r'   r@   rA   rR   s    r   rS   Clong_examples_validator.<locals>.get_long_indexes.<locals>.<lambda>   s     c!((mc!,,>O.ORW.Wr      )axis)r^   ra   rb   rc   )dlong_exampless     r   get_long_indexes1long_examples_validator.<locals>.get_long_indexes   s6    GG$W^_G`M==?((7>>@@r   r   rh   z. examples that are very long. These are rows: zf
For conditional generation, and for classification the examples shouldn't be longer than 2048 tokens.r[   z long examplesc                   > T" U 5      nTU:w  a/  [         R                  R                  S[        U5       SU S35        U R	                  U5      $ )NzeThe indices of the long examples has changed as a result of a previously applied recommendation.
The z? long examples to be dropped are now at the following indices: 
)sysstdoutwriter'   drop)rE   long_indexes_to_dropr|   long_indexess     r   r   ,long_examples_validator.<locals>.optional_fn   s    '7':$#77JJ$$ A  BE  FZ  B[  A\  \[  \p  [q  qs  t vv233r   r{   rn   )rz   r=   r>   r   rJ   )infer_task_typer'   r   )r(   r   r   r   ft_typer|   r   s        @@r   long_examples_validatorr      s     MLKb!G))	A (+|q ,S->,??mnzm{  |c  dM$S%6$7~FL4 4 #!	 r   c                |  ^^ SnSnSnSnSm/ SQnU H~  nUS:X  a:  U R                   R                  R                  S5      R                  5       (       a  MC  U R                   R                  R                  USS9R                  5       (       a  M|  Um  O   TR	                  SS5      n[        U 5      nUS	:X  a	  [        S
S9$ SS jm[        U R                   SS9n	U R                   U	:H  R                  5       (       a  SU	 S3n[        S
US9$ U	S:w  a  U	R	                  SS5      n
SU
 S3n[        U	5      S:  a	  USU S3-  nU R                   R                  S[        U	5      *  R                  R                  U	SS9R                  5       (       a	  USU	 S3-  nOSnU	S:X  a  SU S3nS UU4S jjn[        SUUUUS9$ )!z
This validator will suggest to add a common suffix to the prompt if one doesn't already exist in case of classification or conditional generation.
Nz


### =>

) ->z

###

z

===

z

---

z

===>

z

--->

r   r   Fregex\nrt   common_suffixr   suffixc                     U S==   U-  ss'   U $ Nr@   r   rE   r   s     r   
add_suffix2common_prompt_suffix_validator.<locals>.add_suffix   s    	(vr   xfixzAll prompts are identical: `zt`
Consider leaving the prompts blank if you want to do open-ended generation, otherwise ensure prompts are differentr   r   r$   z 
- All prompts end with suffix `r<   
   R. This suffix seems very long. Consider replacing with a shorter suffix, such as `z5
  WARNING: Some of your prompts contain the suffix `zZ` more than once. We strongly suggest that you review your prompts and add a unique suffixa  
- Your data does not contain a common separator at the end of your prompts. Having a separator string appended to the end of the prompt makes it clearer to the fine-tuned model where the completion should begin. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples. If you intend to do open-ended generation, then you should leave the prompts emptyzAdd a suffix separator `z` to all promptsc                   > T" U T5      $ r8   r   rE   r   suggested_suffixs    r   r   3common_prompt_suffix_validator.<locals>.optional_fn       a!122r   common_completion_suffixr   r   r   r   r   rE   r   r   r   r>   r   rJ   )
r@   r   containsr_   replacer   r   get_common_xfixallr'   )r(   r   r   r   r   suffix_optionssuffix_optiondisplay_suggested_suffixr   r   common_suffix_new_line_handledr   r   s              @@r   common_prompt_suffix_validatorr      s$    IMLK (N (E!yy}}%%d+//1199==!!-u!=AACC( (  077eDb!G))00 $BIIH=M
		]"''))2=/  Bw  x	9EE)6)>)>tU)K&;<Z;[[\]}"q  sK  rL  LM  N  NM99==.C../33<<]RW<X\\^^UVcUd  e  @  @M p12J1KK[\	3 	3 '#! r   c                2  ^^ SnSnSnSn[        U R                  SS9mTS:X  a	  [        SS9$ SS jmU R                  T:H  R                  5       (       a	  [        SS9$ TS:w  a)  S	T S
3nU[	        T5      :  a  US-  nST S3nSUU4S jjn[        SUUUS9$ )z\
This validator will suggest to remove a common prefix from the prompt if a long one exist.
r   Nprefixr   r$   common_prefixr   c                B    U S   R                   [        U5      S  U S'   U $ r   r   r'   )rE   r   s     r   remove_common_prefix<common_prompt_prefix_validator.<locals>.remove_common_prefix  s#    kooc&km4(r   z"
- All prompts start with prefix `r<   z. Fine-tuning doesn't require the instruction specifying the task, or a few-shot example scenario. Most of the time you should only add the input data into the prompt, and the desired output into the completionRemove prefix `z` from all promptsc                   > T" U T5      $ r8   r   )rE   r   r   s    r   r   3common_prompt_prefix_validator.<locals>.optional_fn!  s    +A}==r   common_prompt_prefixrn   )rE   r   r   r   r>   r   rJ   )r   r@   r   r   r'   )r(   MAX_PREFIX_LENr   r   r   r   r   s        @@r   common_prompt_prefix_validatorr     s     NMLK#BIIH=M00 			]"''))00=m_ANC..  r  rM,]O;MNL> > ##!	 r   c                B  ^^^ Sn[        U R                  SS9m[        T5      S:  =(       a    TS   S:H  m[        T5      U:  a	  [        SS9$ SS jmU R                  T:H  R	                  5       (       a	  [        SS9$ S	T S
3nST S3nSUUU4S jjn[        SUUUS9$ )z`
This validator will suggest to remove a common prefix from the completion if a long one exist.
   r   r   r    r   r   c                f    U S   R                   [        U5      S  U S'   U(       a  SU S    3U S'   U $ )NrA   r   r   )rE   r   	ws_prefixs      r   r   @common_completion_prefix_validator.<locals>.remove_common_prefix7  s=    L/--c&km<, !!L/!23AlOr   z&
- All completions start with prefix `z_`. Most of the time you should only add the output data into the completion, without any prefixr   z` from all completionsc                   > T" U TT5      $ r8   r   )rE   r   r   r   s    r   r   7common_completion_prefix_validator.<locals>.optional_fnE  s    #A}i@@r   common_completion_prefixrn   )rE   r   r   r   r   r   r>   r   rJ   )r   rA   r'   r   r   )r(   r   r   r   r   r   r   r   s        @@@r   "common_completion_prefix_validatorr   ,  s     N#BMMAMM"Q&B=+;s+BI
=N*00 	&++--00=m_  Ml  mM$]O3IJLA A '#!	 r   c                  ^^ SnSnSnSn[        U 5      nUS:X  d  US:X  a	  [        SS9$ [        U R                  SS9nU R                  U:H  R	                  5       (       a  SU S	U S
3n[        SUS9$ Sm/ SQnU H>  nU R                  R
                  R                  USS9R                  5       (       a  M<  Um  O   TR                  SS5      n	SS jmUS:w  a  UR                  SS5      n
SU
 S
3n[        U5      S:  a	  USU	 S
3-  nU R                  R
                  S[        U5      *  R
                  R                  USS9R                  5       (       a	  USU S3-  nOSnUS:X  a  SU	 S3nS UU4S jjn[        SUUUUS9$ )!z
This validator will suggest to add a common suffix to the completion if one doesn't already exist in case of classification or conditional generation.
Nrt   classificationr   r   r   r   z All completions are identical: `zJ`
Ensure completions are different, otherwise the model will just repeat `r<   r   z [END])	r   .z ENDz***z+++z&&&z$$$z@@@z%%%Fr   r   r   c                     U S==   U-  ss'   U $ NrA   r   r   s     r   r   6common_completion_suffix_validator.<locals>.add_suffixv  s    	,6!r   r$   z$
- All completions end with suffix `r   r   z9
  WARNING: Some of your completions contain the suffix `zU` more than once. We suggest that you review your completions and add a unique endingaH  
- Your data does not contain a common ending at the end of your completions. Having a common ending string appended to the end of the completion makes it clearer to the fine-tuned model where the completion should end. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detail and examples.zAdd a suffix ending `z` to all completionsc                   > T" U T5      $ r8   r   r   s    r   r   7common_completion_suffix_validator.<locals>.optional_fn  r   r   r   r   r   rJ   )
r   r   r   rA   r   r   r   r_   r   r'   )r(   r   r   r   r   r   r   r   r   r   r   r   r   s              @@r   "common_completion_suffix_validatorr   P  s    IMLKb!G))W8H-H00#BMMAM
&++--6}o  FQ  R_  Q`  `a  b	9EE  
N (==%%m5%AEEGG(	 (
  077eD )6)>)>tU)K&?@^?__`a}"q  sK  rL  LM  N  NM==2M 22377@@V[@\``bbYZgYh  i~    M d./G.HH\]	3 	3 '#! r   c                    S
S jnSnSnSnU R                   R                  SS R                  5       S:w  d   U R                   R                  S   S   S:w  a  SnSnUn[	        SUUUS	9$ )z
This validator will suggest to add a space at the start of the completion if it doesn't already exist. This helps with tokenization.
c                6    U S   R                  S 5      U S'   U $ )NrA   c                B    U R                  S5      (       a  SU -   $ SU -   $ )Nr   r$   )
startswith)r\   s    r   rS   Lcompletions_space_start_validator.<locals>.add_space_start.<locals>.<lambda>  s!    cARAR2_`:`X[_`:`r   )r^   rR   s    r   add_space_start:completions_space_start_validator.<locals>.add_space_start  s     L///0`a,r   Nrx   r   r   z
- The completion should start with a whitespace character (` `). This tends to produce better results due to the tokenization we use. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz=Add a whitespace character to the beginning of the completioncompletion_space_startrn   rJ   )rA   r   nuniquevaluesr   )r(   r   r   r   r   s        r   !completions_space_start_validatorr     s    
 LKM	}}!$$&!+r}}/C/CA/Fq/IS/P BV%%#!	 r   c                   ^ SU4S jjnU T   R                  S 5      R                  5       nU T   R                  S 5      R                  5       nUS-  U:  a  [        SST ST S3S	T S
3US9$ g)zl
This validator will suggest to lowercase the column values, if more than a third of letters are uppercase.
c                H   > U T   R                   R                  5       U T'   U $ r8   )r   r0   )rE   r2   s    r   
lower_case(lower_case_validator.<locals>.lower_case  s"    fIMM'')&	r   c                &    [        S U  5       5      $ )Nc              3     #    U  H4  oR                  5       (       d  M  UR                  5       (       d  M0  S v   M6     g7frx   N)isalphaisupper.0r3   s     r   	<genexpr>9lower_case_validator.<locals>.<lambda>.<locals>.<genexpr>  &     0]AqQRQZQZQ\A   >>	>sumrR   s    r   rS   &lower_case_validator.<locals>.<lambda>      S0]A0]-]r   c                &    [        S U  5       5      $ )Nc              3     #    U  H4  oR                  5       (       d  M  UR                  5       (       d  M0  S v   M6     g7fr   )r   islowerr   s     r   r   r     r   r   r   rR   s    r   rS   r     r   r   r
   r   z
- More than a third of your `z%` column/key is uppercase. Uppercase zs tends to perform worse than a mixture of case encountered in normal language. We recommend to lower case the data if that makes sense in your domain. See https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset for more detailsz'Lowercase all your data in column/key `r<   rn   NrJ   )r^   r   r   )r(   r2   r   count_uppercount_lowers    `   r   lower_case_validatorr     s    
 V*""#]^bbdKV*""#]^bbdKQ$;F8Chiohp  qh  iB6(!L"	
 	
 r   c                   SnSnSnSnSn[         R                  R                  U 5      (       Ga   U R                  5       R	                  S5      (       d$  U R                  5       R	                  S5      (       a`  U R                  5       R	                  S5      (       a  SOSu  pxSU S3nSU S	3n[
        R                  " X[        S
9R                  S5      nGOnU R                  5       R	                  S5      (       ad  SnSn[
        R                  " U 5      n	U	R                  n
[        U
5      S:  a  US-  n[
        R                  " U [        S9R                  S5      nGOU R                  5       R	                  S5      (       av  SnSn[        U S5       nUR                  5       n[
        R                  " UR!                  S5       Vs/ s H  nSU/PM	     snU[        S9R                  S5      nSSS5        GOLU R                  5       R	                  S5      (       af  [
        R"                  " U S[        S9R                  S5      n[        U5      S:X  a-  SnSn[
        R"                  " U [        S9R                  S5      nOOU R                  5       R	                  S5      (       ag   [
        R"                  " U S[        S9R                  S5      n[        U5      S:X  a)  [
        R"                  " U [        S9R                  S5      nO<SnSnO7SnS U ;   a  US!U  S"U R!                  S 5      S#    S$3-  nOUS!U  S%3-  nOS*U  S+3n[+        S,UUUS-9nXb4$ s  snf ! , (       d  f       N"= f! [$         a+    [
        R"                  " U [        S9R                  S5      n NYf = f! [$        [&        4 a1    U R!                  S 5      S#   R)                  5       nS&U  S'U S(U S)3n Nf = f).z
This function will read a file saved in .csv, .json, .txt, .xlsx or .tsv format using pandas.
 - for .xlsx it will read the first sheet
 - for .txt it will assume completions and split on newline
Nz.csvz.tsv)CSV,)TSV	z=
- Based on your file extension, your file is formatted as a z filezYour format `z` will be converted to `JSONL`)sepdtyper$   z.xlsxzH
- Based on your file extension, your file is formatted as an Excel filez/Your format `XLSX` will be converted to `JSONL`rx   z
- Your Excel file contains more than one sheet. Please either save as csv or ensure all data is present in the first sheet. WARNING: Reading only the first sheet...)r   z.txtz9
- Based on your file extension, you provided a text filez.Your format `TXT` will be converted to `JSONL`rr   )r.   r   .jsonlT)linesr   z^
- Your JSONL file appears to be in a JSON format. Your file will be converted to JSONL formatz/Your format `JSON` will be converted to `JSONL`z.jsonz^
- Your JSON file appears to be in a JSONL format. Your file will be converted to JSONL formatz]Your file must have one of the following extensions: .CSV, .TSV, .XLSX, .TXT, .JSON or .JSONLr   z Your file `z` ends with the extension `.z` which is not supported.z` is missing a file extension.zYour file `z!` does not appear to be in valid z9 format. Please ensure your file is formatted as a valid z file.zFile z does not exist.read_any_format)r   r   r   r   )ospathisfiler0   endswithpdread_csvr   fillna	ExcelFilesheet_namesr'   
read_excelopenread	DataFramesplit	read_json
ValueError	TypeErrorupperr   )fnamerF   remediationr   r   r   r(   file_extension_str	separatorxlssheetsfcontentlines                 r   r   r     s    KMMI	B	ww~~e<	v{{}%%f--1G1G1O1O@E@V@VW]@^@^dq-"TUgThhmn  #00B/CCa b[[SAHHL''00 k Qll5)v;?!  &N  NM]]54;;B?''// \ P%%ffhG07d0CD0C"d0CD &! fRj	  &% ''11\\%t3?FFrJr7a< %FM$UMe37>>rBB''00Ce4sCJJ2NB2w!|\\%s;BB2F )J(Y t  %<<w6RSXS^S^_bScdfSgRh  iB  "C  CI<w6T!UUI E7"23	##	K ?c E &%6 " Ce37>>rBBC I& 	v!&S!1"!5!;!;!=%eW,MN`Ma  b[  \n  [o  ou  vI	vs   B&N$ BN$ !4N$ 4M	M
M1N$ ;BN$ $N$ *A M, M, &N$ 7	N$ M
M)%N$ )N$ ,2N!N$  N!!N$ $>O%$O%c                H    [        U 5      nSnUS:X  a  SU S3n[        SUS9$ )z
This validator will infer the likely fine-tuning format of the data, and display it to the user if it is classification.
It will also suggest to use ada and explain train/validation split benefits.
Nr   zK
- Based on your data it seems like you're trying to fine-tune a model for z
- For classification, we recommend you try one of the faster and cheaper models, such as `ada`
- For classification, you can estimate the expected model performance by keeping a held out dataset, which is not used for trainingr%   r&   )r   r   )r(   r   r   s      r   format_inferrer_validatorr    sA    
 b!GM""fgnfo  pU  VN-HHr   c                h   UR                   bP  [        R                  R                  SUR                   SUR                    S35        [        R
                  " S5        UR                  b)  [        R                  R                  UR                  5        UR                  b  UR                  U 5      n U $ )zk
This function will apply a necessary remediation to a dataframe, or print an error message if one exists.
z

ERROR in z validator: z

Aborting...rx   )	r   r   stderrr   r   exitr   r   r   )r(   r  s     r   apply_necessary_remediationr  (  s     (

=)9)9(:,{G\G\F]]lmn  ,

223+%%b)Ir   c                    [         R                  R                  U 5        U(       a   [         R                  R                  S5        g[        5       R	                  5       S:g  $ )NzY
Tn)r   r   r   inputr0   )
input_textauto_accepts     r   accept_suggestionr#  6  s?    JJZ 

7==?c!!r   c                   SnSUR                    S3nUR                   b2  [        XB5      (       a"  UR                  c   eUR                  U 5      n SnUR                  b-  [        R
                  R                  SUR                   S35        X4$ )z[
This function will apply an optional remediation to a dataframe, based on the user input.
Fz- [Recommended] z [Y/n]: Tz- [Necessary] r   )r   r#  r   r   r   r   r   )r(   r  r"  optional_appliedr!  s        r   apply_optional_remediationr&  >  s     #K$<$<#=XFJ+Z55**666((,B#  ,

>+*C*C)DBGHr   c                    [        U 5      nSnUS:X  a  [        U 5      nUS-  nO"U R                  SS9R                  5       nUS-  nSS jnU" US-   5      n[        R
                  R                  S	U S
35        g)z7
Estimate the time it'll take to fine-tune the dataset
g      ?r   g
ףp=
?T)rb   g|?5^?c                    U S:  a  [        U S5       S3$ U S:  a  [        U S-  S5       S3$ U S:  a  [        U S-  S5       S3$ [        U S-  S5       S3$ )	N<   r
   z secondsi  z minutesiQ z hoursz days)round)times    r   format_time.estimate_fine_tuning_time.<locals>.format_time]  sv    "9D!n%X..D[D2Iq)*(33E\D4K+,F33D5L!,-U33r      z:Once your model starts training, it'll approximately take z~ to train a `curie` model, and less for `ada` and `babbage`. Queue will approximately take half an hour per job ahead of you.
N)r+  floatr>   r   )r   r'   memory_usager   r   r   r   )r(   	ft_formatexpected_timer%   sizer,  time_strings          r   estimate_fine_tuning_timer5  P  s      #IM$$2w$t+T*..0v4 mc12KJJ
D[M  RQ  	Rr   c                    U(       a  SS/OS/nSn US:  a  SU S3OSnU Vs/ s H-  n[         R                  R                  U 5      S    SU U S3PM/     nn[        S	 U 5       5      (       d  U$ US
-  nMh  s  snf )N_train_validr$   r   z ()	_preparedr   c              3  `   #    U  H$  n[         R                  R                  U5      v   M&     g 7fr8   )r   r   r   )r   r  s     r   r    get_outfnames.<locals>.<genexpr>s  s"     ?.>277>>!$$.>s   ,.rx   )r   r   splitextr_   )r  r
  suffixesiindex_suffixr   candidate_fnamess          r   get_outfnamesrB  m  s    ',(#2$H	A
$%EA3ayrowxowekrww//6q9:)F8L>Y_`owx?.>???##	Q xs   4A:c                    U R                   R                  5       nS nUS:X  a'  U R                   R                  5       R                  S   nX4$ )Nr
   r   )rA   r   value_countsrb   )r(   	n_classes	pos_classs      r   get_classification_hyperparamsrG  x  sF    %%'IIA~MM..066q9	r   c                V   [        U 5      n[        U R                  SS9n[        U R                  SS9nSnSnUS:X  a  [	        X5      (       a  SnSn	UR                  SS	5      n
UR                  SS	5      n[        U5      S
:  a  SU S3OSnSnU(       d?  U(       d8  [        R                  R                  SU SU	 SU
 SU S3	5        [        U 5        g[	        X5      (       Ga  [        X5      nU(       a  [        U5      S:X  a  SUS
   ;   a	  SUS   ;   d   eSn[        [        U 5      U-
  [        [        U 5      S-  5      5      nU R                  USS9nU R                  UR                   5      nUSS/   R#                  US
   SSSSS9  USS/   R#                  US   SSSSS9  [%        U 5      u  nnU	S-  n	US:X  a
  U	S U S3-  n	O5U	S!U 3-  n	O,[        U5      S:X  d   eU SS/   R#                  US
   SSSSS9  U(       a  S"OSS#-   S$R'                  U5      -   nU(       a	  S%US    S3OSn[        U
5      S
:X  a  SOS&U
 S3n[        R                  R                  S'U S(US
    SU U	 S)U U S35        [        U 5        g[        R                  R                  S*5        g)+aE  
This function will write out a dataframe to a file, if the user would like to proceed, and also offer a fine-tuning command with the newly created file.
For classification it will optionally ask the user if they would like to split the data into train/valid files, and modify the suggested command to include the valid set.
r   r   FzQ- [Recommended] Would you like to split into training and validation set? [Y/n]: r   Tr$   r   r   r   z Make sure to include `stop=["z;"]` so that the generated texts ends at the expected place.z@

Your data will be written to a new JSONL file. Proceed [Y/n]: zK
You can use your file for fine-tuning:
> openai api fine_tunes.create -t ""ue   

After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `zX` for the model to start generating completions, rather than continuing with the prompt.r
   trainvalidrx   i  g?*   )r  random_stater@   rA   recordsN)r   orientforce_asciiindentz! --compute_classification_metricsz" --classification_positive_class "z --classification_n_classes r\   z to `z` and `z -v "uc   After you’ve fine-tuned a model, remember that your prompt has to end with the indicator string `z
Wrote modified filezd`
Feel free to take a look!

Now use that file when fine-tuning:
> openai api fine_tunes.create -t "z

z#Aborting... did not write the file
)r   r   r@   rA   r#  r   r'   r   r   r   r5  rB  maxintsampler   rb   to_jsonrG  rp   )r(   r  any_remediationsr"  r1  common_prompt_suffixr   r
  r!  additional_params%common_prompt_suffix_new_line_handled)common_completion_suffix_new_line_handledoptional_ending_stringfnamesMAX_VALID_EXAMPLESn_traindf_traindf_validrE  rF  files_stringvalid_stringseparator_reminders                          r   write_out_filerd    s   
  #I*2998D.r}}8LEdJ$$Z55E,@,H,Hu,U)0H0P0PQUW\0]- 89A= ))R(S  TO  	P  VJE

[\a[bbcducv  w^  _D  ^E  E]  ^t  ]u  uw  x	
 	""%	:	3	3u,v;!#6!9(<FSTIAUUU!%#b'$66CGcM8JKGyy7y<Hwwx~~.Hh-.66q	iUSW 7  h-.66q	iUSW 7  $B"#E Iy!DDA~!'I)TU%VV!!'CI;%OO!v;!###,'(00q	iUSW 1 
  %"79>>&;QR/4vayk+" 89Q> v  x]  w^  ^v  w 	
 	

#L>  2Z  [a  bc  [d  Ze  ef  gs  ft  uF  tG  GK  L^  K_  `v  _w  wy  z	
 	""%

?@r   c                    Sn[        U R                  R                  R                  5       5      S:X  a  g[        U R                  R                  5       5      [        U 5      U-  :  a  gg)z6
Infer the likely fine-tuning task type from the data
   r   rt   r   zconditional generation)r   r@   r   r'   rA   unique)r(   CLASSIFICATION_THRESHOLDs     r   r   r     sU      !
299==1$&
2==!"SW/G%GG#r   c                    Sn US:X  a  U R                   [        U5      S-   * S OU R                   S[        U5      S-    nUR                  5       S:w  a   U$ X#R                  S   :X  a   U$ UR                  S   nMz  )zI
Finds the longest common suffix or prefix of all the values in a series
r$   r   rx   Nr   )r   r'   r   r   )seriesr   common_xfixcommon_xfixess       r   r   r     s     K
59X5EFJJ[)A-.016::VlX[\gXhklXlKm 	   "a'
 	 0033  (..q1K r   z,Callable[[pd.DataFrame], Remediation | None]r	   	Validatorc                     [         S S [        [        [        [        [
        S S [        [        [        [        [        /$ )Nc                    [        U S5      $ r   r?   rR   s    r   rS    get_validators.<locals>.<lambda>  s    ,Q9r   c                    [        U S5      $ r   rp  rR   s    r   rS   rq    s    ,Q=r   c                    [        U S5      $ r   r   rR   s    r   rS   rq    s    &q(3r   c                    [        U S5      $ r   rt  rR   s    r   rS   rq    s    &q,7r   )r+   rO   rf   r  rr   r   r   r   r   r   r   r   r   r   get_validatorsrv    s9    9=#!!!37&&**) r   c                h   / nUb  UR                  U5        U H,  nU" U 5      nUc  M  UR                  U5        [        X5      n M.     [        U Vs/ s H!  nUR                  c  UR                  c  M  UPM#     sn5      n[        U Vs/ s H  o"R                  c  M  UPM     sn5      n	Sn
U(       aB  [
        R                  R                  S5        U H  n[        XU5      u  pU
=(       d    Un
M     O[
        R                  R                  S5        U
=(       d    U	nU" XX5        g s  snf s  snf )NFz?

Based on the analysis we will perform the following actions:
z

No remediations found.
)	appendr  r_   r   r   r   r   r   r&  )r(   r  r  
validatorsr"  write_out_file_funcoptional_remediations	validator&any_optional_or_necessary_remediationsany_necessary_appliedany_optional_appliedr%  !any_optional_or_necessary_applieds                r   apply_validatorsr    s.    02$$[1	m"!((5,R=B	   .1  5	
4''3{7P7P 4	
.*  (=g(=AZAZ(=g !-

]^0K#=b{#[ B#7#K;K  1 	

78(<(U@U%#DR+	
 	hs   D*4D*
D/D/)r(   r=   r>   r   )r(   r=   r9   r   r>   r   )r(   r=   rF   	list[str]r>   r   )rA   )r(   r=   rY   r   r>   r   )r(   r=   r2   r   r>   Remediation | None)r  r   rF   r  r>   z'tuple[pd.DataFrame | None, Remediation])r(   r    r  r   r>   r    )r!  r   r"  boolr>   r  )r(   r=   r  r   r"  r  r>   ztuple[pd.DataFrame, bool])r(   r=   r>   None)r  r   r
  r  r>   r  )r(   r=   r>   ztuple[int, object])
r(   r=   r  r   rV  r  r"  r  r>   r  )r(   r=   r>   r   )r   )rj  r   r   r   r>   r   )r>   list[Validator])r(   r=   r  r   r  r  ry  r  r"  r  rz  zCallable[..., Any]r>   r  ),
__future__r   r   r   typingr   r   r   r   r   typing_extensionsr	   _extrasr   r  r   r    r+   r?   rO   rf   rr   r   r   r   r   r   r   r   r   r  r  r#  r&  r5  rB  rG  rd  r   r   rm  r   rv  r  r   r   r   <module>r     sx   " 	 
 ? ? ' "$* $ 19QR I!H HPQ]F^ <4 FN|D\ 2"JAH$N!HAH2. &.|$<VV!V,Vr	I"  #. =A  $: HAV$$ F	9 E('S'S'S $'S  	'S
 'S ,'S 
'Sr   