
    gDN                       d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d      Z G d	 d
e      Z G d de      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% ed      	 	 	 	 	 	 d!d       Z& ed      	 d"	 	 	 	 	 	 	 d#d       Z'y )$    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                  :    e Zd ZdZddZddZd	dZed
d       Zy)MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    c                    t         )z@
        Determine if given character should be fed in.
        NotImplementedErrorself	characters     U/var/www/api/v1/venv_getwork_v1/lib/python3.12/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible&   
     "!    c                    t         )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r    s     r#   feedzMessDetectorPlugin.feed,   s
    
 "!r&   c                    t         )zB
        Permit to reset the plugin to the initial state.
        r   r!   s    r#   resetzMessDetectorPlugin.reset3   r%   r&   c                    t         )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r*   s    r#   ratiozMessDetectorPlugin.ratio9   s
     "!r&   Nr"   strreturnboolr"   r/   r0   Noner0   r3   r0   float)	__name__
__module____qualname____doc__r$   r(   r+   propertyr-    r&   r#   r   r       s*    
""" " "r&   r   c                  >    e Zd ZddZddZd	dZddZed
d       Zy) TooManySymbolOrPunctuationPluginc                J    d| _         d| _        d| _        d | _        d| _        y )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr*   s    r#   __init__z)TooManySymbolOrPunctuationPlugin.__init__C   s*    '("#%&04!,1#r&   c                "    |j                         S Nisprintabler    s     r#   r$   z)TooManySymbolOrPunctuationPlugin.eligibleK       $$&&r&   c                8   | xj                   dz  c_         || j                  k7  ro|t        vrgt        |      r| xj                  dz  c_        || _        y |j                         du r-t        |      r"t        |      du r| xj                  dz  c_        || _        y )Nr   F   )	rB   rC   r   r   r@   isdigitr   r   rA   r    s     r#   r(   z%TooManySymbolOrPunctuationPlugin.feedN   s    " 222!==i(''1,' %.! !!#u,i(	*e3""a'"$-!r&   c                .    d| _         d| _        d| _        y Nr   )r@   rB   rA   r*   s    r#   r+   z&TooManySymbolOrPunctuationPlugin.reset`   s    "# !r&   c                    | j                   dk(  ry| j                  | j                  z   | j                   z  }|dk\  r|S dS )Nr           333333?)rB   r@   rA   )r!   ratio_of_punctuations     r#   r-   z&TooManySymbolOrPunctuationPlugin.ratioe   sO      A% ##d&8&88!!'" (<s'B#KKr&   Nr4   r.   r2   r5   	r7   r8   r9   rE   r$   r(   r+   r;   r-   r<   r&   r#   r>   r>   B   s,    2'.$
 L Lr&   r>   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)TooManyAccentuatedPluginc                     d| _         d| _        y rO   rB   _accentuated_countr*   s    r#   rE   z!TooManyAccentuatedPlugin.__init__r   s    %&'(r&   c                "    |j                         S rG   )isalphar    s     r#   r$   z!TooManyAccentuatedPlugin.eligiblev   s      ""r&   c                p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y Nr   )rB   r
   rY   r    s     r#   r(   zTooManyAccentuatedPlugin.feedy   s1    ")$##q(# %r&   c                     d| _         d| _        y rO   rX   r*   s    r#   r+   zTooManyAccentuatedPlugin.reset   s     !"#r&   c                f    | j                   dk  ry| j                  | j                   z  }|dk\  r|S dS )N   rQ   gffffff?rX   )r!   ratio_of_accentuations     r#   r-   zTooManyAccentuatedPlugin.ratio   s=      1$'+'>'>AVAV'V(=(E$N3Nr&   Nr4   r.   r2   r5   rT   r<   r&   r#   rV   rV   q   s,    )#)$ O Or&   rV   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)UnprintablePluginc                     d| _         d| _        y rO   )_unprintable_countrB   r*   s    r#   rE   zUnprintablePlugin.__init__   s    '(%&r&   c                     yNTr<   r    s     r#   r$   zUnprintablePlugin.eligible       r&   c                n    t        |      r| xj                  dz  c_        | xj                  dz  c_        y r]   )r   re   rB   r    s     r#   r(   zUnprintablePlugin.feed   s,    )$##q(#"r&   c                    d| _         y rO   )re   r*   s    r#   r+   zUnprintablePlugin.reset   s
    "#r&   c                Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rQ   r`   )rB   re   r*   s    r#   r-   zUnprintablePlugin.ratio   s/      A%''!+t/D/DDDr&   Nr4   r.   r2   r5   rT   r<   r&   r#   rc   rc      s,    '#
$ E Er&   rc   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)SuspiciousDuplicateAccentPluginc                .    d| _         d| _        d | _        y rO   _successive_countrB   _last_latin_characterr*   s    r#   rE   z(SuspiciousDuplicateAccentPlugin.__init__   s    &'%&15"r&   c                <    |j                         xr t        |      S rG   )r[   r   r    s     r#   r$   z(SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r&   c                ~   | xj                   dz  c_         | j                  t        |      rt        | j                        ru|j                         r/| j                  j                         r| xj                  dz  c_        t        |      t        | j                        k(  r| xj                  dz  c_        || _        y r]   )rB   rq   r
   isupperrp   r   r    s     r#   r(   z$SuspiciousDuplicateAccentPlugin.feed   s    "&&2y)t99:  "t'A'A'I'I'K&&!+&Y'=9S9S+TT&&!+&%."r&   c                .    d| _         d| _        d | _        y rO   ro   r*   s    r#   r+   z%SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r&   c                Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rQ   rL   )rB   rp   r*   s    r#   r-   z%SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr&   Nr4   r.   r2   r5   rT   r<   r&   r#   rm   rm      s,    6;/*
 D Dr&   rm   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)SuspiciousRangec                .    d| _         d| _        d | _        y rO   )"_suspicious_successive_range_countrB   _last_printable_seenr*   s    r#   rE   zSuspiciousRange.__init__   s    78/%&04!r&   c                "    |j                         S rG   rH   r    s     r#   r$   zSuspiciousRange.eligible   rJ   r&   c                <   | xj                   dz  c_         |j                         st        |      s|t        v rd | _        y | j                  || _        y t        | j                        }t        |      }t        ||      r| xj                  dz  c_        || _        y r]   )rB   isspacer   r   r{   r    is_suspiciously_successive_rangerz   )r!   r"   unicode_range_aunicode_range_bs       r#   r(   zSuspiciousRange.feed   s    " i(88(,D%$$,(1D%&3D4M4M&N&3I&>+O_M33q83$-!r&   c                .    d| _         d| _        d | _        y rO   )rB   rz   r{   r*   s    r#   r+   zSuspiciousRange.reset   s     !23/$(!r&   c                ^    | j                   dk  ry| j                  dz  | j                   z  }|S )N   rQ   rL   )rB   rz   )r!   ratio_of_suspicious_range_usages     r#   r-   zSuspiciousRange.ratio   s<      B& 33a7!!2"' /.r&   Nr4   r.   r2   r5   rT   r<   r&   r#   rx   rx      s*    5
'..)
 / /r&   rx   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)SuperWeirdWordPluginc                    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _	        y )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrB   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr*   s    r#   rE   zSuperWeirdWordPlugin.__init__   sQ     !$%() */!). %&)*!)*!() r&   c                     yrg   r<   r    s     r#   r$   zSuperWeirdWordPlugin.eligible
  rh   r&   c                   |j                         r| xj                  |z  c_        t        |      r| xj                  dz  c_        | j                  du r`t        |      du st        |      rHt        |      du r;t        |      du r.t        |      du r!t        |      du rt        |      du rd| _        t        |      s,t        |      s!t        |      st        |      st        |      r| xj                  dz  c_        y | j                  sy |j                         st        |      st        |      r| j                  r| xj                  dz  c_        t!        | j                        }| xj"                  |z  c_        |dk\  r| j                  |z  dk\  rd| _        nt        | j                  d         rX| j                  d   j'                         r;t)        d | j                  D              du r| xj*                  dz  c_        d| _        n+| j                  dk(  rd| _        | xj*                  dz  c_        |dk\  r| j                  rwt-        | j                  t/        d	|            D cg c]  \  }}|j'                         r| }}}d}|rt!        |      |z  d
k  rd}|s| xj*                  dz  c_        d| _        | j$                  rD| xj0                  dz  c_        | xj2                  t!        | j                        z  c_        d| _        d| _        d| _        d	| _        d	| _        y |dvr<|j5                         du r)t7        |      rd| _        | xj                  |z  c_        y y y y c c}}w )Nr   FT   g      ?c              3  <   K   | ]  }|j                           y wrG   )rt   ).0_s     r#   	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>7  s     >AAIIKs      r   rR   r   >   -<=>r   |~)r[   r   r
   r   r   r   r   r   r   r   r   r   r~   r   r   r   lenrB   r   rt   allr   zipranger   r   rM   r   )r!   r"   buffer_lengthcicamel_case_dstprobable_camel_caseds          r#   r(   zSuperWeirdWordPlugin.feed  s   LLI%Li())Q.)((E1i(E1^I5N9%.i(E1	*e3	*e3I&%/+/(y!Y'y)y)9%((A-(||>)#<Y@Wll!!$T\\!2M!!]2!!,,}<C04D- #4<<#34R(002>>>%G,,1,04D---204D-,,1,"t'?'? !$DLL%=2I J" J1yy{  J  "
 .3$!s>':]'Jc'Q+/(+,,1,04D-(($$)$))S->>),1)',D$DL()D%'(D$@@!!#u,)$(,D%LLI%L % - A1"s   /M1c                t    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   Fr   )r   r   r   r   r   rB   r   r   r*   s    r#   r+   zSuperWeirdWordPlugin.reset^  sA    $)!#(   !$%!#$ r&   c                r    | j                   dk  r| j                  dk(  ry| j                  | j                  z  S )N
   r   rQ   )r   r   r   rB   r*   s    r#   r-   zSuperWeirdWordPlugin.ratioh  s7    r!d&>&>!&C((4+@+@@@r&   Nr4   r.   r2   r5   rT   r<   r&   r#   r   r      s.    *O&b% A Ar&   r   c                  B    e Zd ZdZddZd	dZd
dZddZedd       Z	y)CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    c                     d| _         d| _        y rO   _wrong_stop_count_cjk_character_countr*   s    r#   rE   zCjkInvalidStopPlugin.__init__v  s    &')*!r&   c                     yrg   r<   r    s     r#   r$   zCjkInvalidStopPlugin.eligiblez  rh   r&   c                z    |dv r| xj                   dz  c_         y t        |      r| xj                  dz  c_        y y )N>      丄   丅r   )r   r   r   r    s     r#   r(   zCjkInvalidStopPlugin.feed}  s<    &""a'")%%*% r&   c                     d| _         d| _        y rO   r   r*   s    r#   r+   zCjkInvalidStopPlugin.reset  s    !"$%!r&   c                T    | j                   dk  ry| j                  | j                   z  S )N   rQ   )r   r   r*   s    r#   r-   zCjkInvalidStopPlugin.ratio  s*    $$r)%%(A(AAAr&   Nr4   r.   r2   r5   )
r7   r8   r9   r:   rE   r$   r(   r+   r;   r-   r<   r&   r#   r   r   p  s1    
++& B Br&   r   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)ArchaicUpperLowerPluginc                f    d| _         d| _        d| _        d| _        d| _        d | _        d| _        y )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrB   _last_alpha_seen_current_ascii_onlyr*   s    r#   rE   z ArchaicUpperLowerPlugin.__init__  s9    	45,23*890%&,0)- r&   c                     yrg   r<   r    s     r#   r$   z ArchaicUpperLowerPlugin.eligible  rh   r&   c                Z   |j                         xr t        |      }|du }|r| j                  dkD  r| j                  dk  r?|j                         du r-| j                  du r| xj
                  | j                  z  c_        d| _        d| _        d | _        d| _        | xj                  dz  c_	        d| _        y | j                  du r|j                         du rd| _        | j                  |j                         r| j                  j                         s*|j                         rM| j                  j                         r3| j                  du r| xj                  dz  c_        d| _        nd| _        nd| _        | xj                  dz  c_	        | xj                  dz  c_        || _        y )NFr   @   r   TrL   )r[   r   r   rM   r   r   r   r   r   rB   isasciirt   islower)r!   r"   is_concerned	chunk_seps       r#   r(   zArchaicUpperLowerPlugin.feed  s    ((*J/?	/J E)	==A44:%%'50,,588668 23D.34D0$(D!DI!!Q&!'+D$##t+	0A0A0Cu0L',D$  ,!!#(=(=(E(E(G!!#(=(=(E(E(G99$66!;6 %DI $DI!	",,1, )r&   c                f    d| _         d| _        d| _        d| _        d | _        d| _        d| _        y )Nr   FT)rB   r   r   r   r   r   r   r*   s    r#   r+   zArchaicUpperLowerPlugin.reset  s9     !/0,-.*340 $	#' r&   c                T    | j                   dk(  ry| j                  | j                   z  S )Nr   rQ   )rB   r   r*   s    r#   r-   zArchaicUpperLowerPlugin.ratio  s*      A%77$:O:OOOr&   Nr4   r.   r2   r5   rT   r<   r&   r#   r   r     s-    .(*T( P Pr&   r   c                  >    e Zd ZddZddZddZd	dZed
d       Zy)ArabicIsolatedFormPluginc                     d| _         d| _        y rO   rB   _isolated_form_countr*   s    r#   rE   z!ArabicIsolatedFormPlugin.__init__  s    %&)*!r&   c                     d| _         d| _        y rO   r   r*   s    r#   r+   zArabicIsolatedFormPlugin.reset  s     !$%!r&   c                    t        |      S rG   )r   r    s     r#   r$   z!ArabicIsolatedFormPlugin.eligible  s    ##r&   c                p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y r]   )rB   r   r   r    s     r#   r(   zArabicIsolatedFormPlugin.feed  s1    ""9-%%*% .r&   c                X    | j                   dk  ry| j                  | j                   z  }|S )Nr`   rQ   r   )r!   isolated_form_usages     r#   r-   zArabicIsolatedFormPlugin.ratio  s0      1$%)%>%>AVAV%V""r&   Nr4   r.   r2   r5   )	r7   r8   r9   rE   r+   r$   r(   r;   r-   r<   r&   r#   r   r     s*    +&$+ # #r&   r      )maxsizec                   | |y| |k(  ryd| v rd|v ryd| v sd|v ryd| v sd|v r	d| v sd|v ry| j                  d      |j                  d      }}|D ]  }|t        v r||v s y | dv |dv }}|s|r	d| v sd|v ry|r|ryd	| v sd	|v rd| v sd|v ry| d
k(  s|d
k(  ryd| v sd|v s| dv r!|dv rd| v sd|v ryd| v sd|v ry| d
k(  s|d
k(  ryy)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    TFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr	   )r   r   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r#   r   r     s    /"9/)/!g&@o%)G 	?"g&@&+*H 	c"c" '
 00!!	  	
	

 	33 ' 	, E_$<,?"h/&AO#u'?m+-/O 	 E_$<3377O+}/Oo%O)Cm+-/Or&   i   c           	        t         j                         D cg c]	  } |        }}t        |       dz   }d}|dk  rd}n
|dk  rd}nd}t        | dz   t	        |            D ]^  \  }}	|D ]%  }
|
j                  |      s|
j                  |       ' |	d	kD  r|	|z  d	k(  s	|	|dz
  k(  sFt        d
 |D              }||k\  s^ n |rt        d      }|j                  t        d| d| d|        t        |       dkD  r8|j                  t        d| dd         |j                  t        d| dd         |D ]1  }|j                  t        |j                   d|j                          3 t        |d      S c c}w )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    r   rQ   i       r   r      
r   c              3  4   K   | ]  }|j                     y wrG   )r-   )r   dts     r#   r   zmess_ratio.<locals>.<genexpr>`  s     !?Yr"((Ys   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r$   r(   sumr   logr   	__class__r-   round)decoded_sequencemaximum_thresholddebugmd_class	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr"   indexdetectorloggerr   s                r#   
mess_ratior  A  s    $6#D#D#F+#Fx
#F  + &'!+F O|13)	4,.),/) 04 7vG	5!H  +i( "
 AI%"CCqHfqj !!?Y!??O"33 H /0

11R0SSdetdu v!!2 35	
  2%JJu0@"0E/FGHJJu.>su.E-FGHBJJub
;<  !$$[+s   E6N)r   
str | Noner   r  r0   r1   )g?F)r   r/   r   r6   r   r1   r0   r6   )(
__future__r   	functoolsr   loggingr   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r>   rV   rc   rm   rx   r   r   r   r   r   r  r<   r&   r#   <module>r	     s;   "   
    *" "D,L'9 ,L^O1 O6E* E0"D&8 "DJ./( ./bsA- sAlB- B>IP0 IPX#1 #8 4FF2<F	F FR 4IN4%4%.34%BF4%
4% 4%r&   