
    ,hN                       d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ  G d d      Z G d	 d
e      Z G d de      Z G d de      Z  G d de      Z! G d de      Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& ed      	 	 	 	 	 	 d!d       Z' ed      	 d"	 	 	 	 	 	 	 d#d       Z(y )$    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangeis_cjk_uncommonc                  :    e Zd ZdZddZddZd	dZed
d       Zy)MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    c                    t         )z@
        Determine if given character should be fed in.
        NotImplementedErrorself	characters     W/var/www/html/Resume-Scraper/venv/lib/python3.12/site-packages/charset_normalizer/md.pyeligiblezMessDetectorPlugin.eligible'   
     "!    c                    t         )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r!   s     r$   feedzMessDetectorPlugin.feed-   s
    
 "!r'   c                    t         )zB
        Permit to reset the plugin to the initial state.
        r   r"   s    r$   resetzMessDetectorPlugin.reset4   r&   r'   c                    t         )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r+   s    r$   ratiozMessDetectorPlugin.ratio:   s
     "!r'   Nr#   strreturnboolr#   r0   r1   Noner1   r4   r1   float)	__name__
__module____qualname____doc__r%   r)   r,   propertyr.    r'   r$   r   r   !   s*    
""" " "r'   r   c                  >    e Zd ZddZddZd	dZddZed
d       Zy) TooManySymbolOrPunctuationPluginc                J    d| _         d| _        d| _        d | _        d| _        y )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr+   s    r$   __init__z)TooManySymbolOrPunctuationPlugin.__init__D   s*    '("#%&04!,1#r'   c                "    |j                         S Nisprintabler!   s     r$   r%   z)TooManySymbolOrPunctuationPlugin.eligibleL       $$&&r'   c                8   | xj                   dz  c_         || j                  k7  ro|t        vrgt        |      r| xj                  dz  c_        || _        y |j                         du r-t        |      r"t        |      du r| xj                  dz  c_        || _        y )Nr   F   )	rC   rD   r   r   rA   isdigitr   r   rB   r!   s     r$   r)   z%TooManySymbolOrPunctuationPlugin.feedO   s    " 222!==i(''1,' %.! !!#u,i(	*e3""a'"$-!r'   c                .    d| _         d| _        d| _        y Nr   )rA   rC   rB   r+   s    r$   r,   z&TooManySymbolOrPunctuationPlugin.reseta   s    "# !r'   c                    | j                   dk(  ry| j                  | j                  z   | j                   z  }|dk\  r|S dS )Nr           333333?)rC   rA   rB   )r"   ratio_of_punctuations     r$   r.   z&TooManySymbolOrPunctuationPlugin.ratiof   sO      A% ##d&8&88!!'" (<s'B#KKr'   Nr5   r/   r3   r6   	r8   r9   r:   rF   r%   r)   r,   r<   r.   r=   r'   r$   r?   r?   C   s,    2'.$
 L Lr'   r?   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)TooManyAccentuatedPluginc                     d| _         d| _        y rP   rC   _accentuated_countr+   s    r$   rF   z!TooManyAccentuatedPlugin.__init__s   s    %&'(r'   c                "    |j                         S rH   )isalphar!   s     r$   r%   z!TooManyAccentuatedPlugin.eligiblew   s      ""r'   c                p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y Nr   )rC   r
   rZ   r!   s     r$   r)   zTooManyAccentuatedPlugin.feedz   s1    ")$##q(# %r'   c                     d| _         d| _        y rP   rY   r+   s    r$   r,   zTooManyAccentuatedPlugin.reset   s     !"#r'   c                f    | j                   dk  ry| j                  | j                   z  }|dk\  r|S dS )N   rR   gffffff?rY   )r"   ratio_of_accentuations     r$   r.   zTooManyAccentuatedPlugin.ratio   s=      1$'+'>'>AVAV'V(=(E$N3Nr'   Nr5   r/   r3   r6   rU   r=   r'   r$   rW   rW   r   s,    )#)$ O Or'   rW   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)UnprintablePluginc                     d| _         d| _        y rP   )_unprintable_countrC   r+   s    r$   rF   zUnprintablePlugin.__init__   s    '(%&r'   c                     yNTr=   r!   s     r$   r%   zUnprintablePlugin.eligible       r'   c                n    t        |      r| xj                  dz  c_        | xj                  dz  c_        y r^   )r   rf   rC   r!   s     r$   r)   zUnprintablePlugin.feed   s,    )$##q(#"r'   c                    d| _         y rP   )rf   r+   s    r$   r,   zUnprintablePlugin.reset   s
    "#r'   c                Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rR   ra   )rC   rf   r+   s    r$   r.   zUnprintablePlugin.ratio   s/      A%''!+t/D/DDDr'   Nr5   r/   r3   r6   rU   r=   r'   r$   rd   rd      s,    '#
$ E Er'   rd   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)SuspiciousDuplicateAccentPluginc                .    d| _         d| _        d | _        y rP   _successive_countrC   _last_latin_characterr+   s    r$   rF   z(SuspiciousDuplicateAccentPlugin.__init__   s    &'%&15"r'   c                <    |j                         xr t        |      S rH   )r\   r   r!   s     r$   r%   z(SuspiciousDuplicateAccentPlugin.eligible   s      ":x	'::r'   c                ~   | xj                   dz  c_         | j                  t        |      rt        | j                        ru|j                         r/| j                  j                         r| xj                  dz  c_        t        |      t        | j                        k(  r| xj                  dz  c_        || _        y r^   )rC   rr   r
   isupperrq   r   r!   s     r$   r)   z$SuspiciousDuplicateAccentPlugin.feed   s    "&&2y)t99:  "t'A'A'I'I'K&&!+&Y'=9S9S+TT&&!+&%."r'   c                .    d| _         d| _        d | _        y rP   rp   r+   s    r$   r,   z%SuspiciousDuplicateAccentPlugin.reset   s    !" !%)"r'   c                Z    | j                   dk(  ry| j                  dz  | j                   z  S )Nr   rR   rM   )rC   rq   r+   s    r$   r.   z%SuspiciousDuplicateAccentPlugin.ratio   s/      A%&&*d.C.CCCr'   Nr5   r/   r3   r6   rU   r=   r'   r$   rn   rn      s,    6;/*
 D Dr'   rn   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)SuspiciousRangec                .    d| _         d| _        d | _        y rP   )"_suspicious_successive_range_countrC   _last_printable_seenr+   s    r$   rF   zSuspiciousRange.__init__   s    78/%&04!r'   c                "    |j                         S rH   rI   r!   s     r$   r%   zSuspiciousRange.eligible   rK   r'   c                <   | xj                   dz  c_         |j                         st        |      s|t        v rd | _        y | j                  || _        y t        | j                        }t        |      }t        ||      r| xj                  dz  c_        || _        y r^   )rC   isspacer   r   r|   r    is_suspiciously_successive_ranger{   )r"   r#   unicode_range_aunicode_range_bs       r$   r)   zSuspiciousRange.feed   s    " i(88(,D%$$,(1D%&3D4M4M&N&3I&>+O_M33q83$-!r'   c                .    d| _         d| _        d | _        y rP   )rC   r{   r|   r+   s    r$   r,   zSuspiciousRange.reset   s     !23/$(!r'   c                ^    | j                   dk  ry| j                  dz  | j                   z  }|S )N   rR   rM   )rC   r{   )r"   ratio_of_suspicious_range_usages     r$   r.   zSuspiciousRange.ratio   s<      B& 33a7!!2"' /.r'   Nr5   r/   r3   r6   rU   r=   r'   r$   ry   ry      s*    5
'..)
 / /r'   ry   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)SuperWeirdWordPluginc                    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _        d| _	        y )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchrC   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr+   s    r$   rF   zSuperWeirdWordPlugin.__init__   sQ     !$%() */!). %&)*!)*!() r'   c                     yrh   r=   r!   s     r$   r%   zSuperWeirdWordPlugin.eligible  ri   r'   c                   |j                         r| xj                  |z  c_        t        |      r| xj                  dz  c_        | j                  du r`t        |      du st        |      rHt        |      du r;t        |      du r.t        |      du r!t        |      du rt        |      du rd| _        t        |      s,t        |      s!t        |      st        |      st        |      r| xj                  dz  c_        y | j                  sy |j                         st        |      st        |      r| j                  r| xj                  dz  c_        t!        | j                        }| xj"                  |z  c_        |dk\  r| j                  |z  dk\  rd| _        nt        | j                  d         rX| j                  d   j'                         r;t)        d | j                  D              du r| xj*                  dz  c_        d| _        n+| j                  dk(  rd| _        | xj*                  dz  c_        |dk\  r| j                  rwt-        | j                  t/        d	|            D cg c]  \  }}|j'                         r| }}}d}|rt!        |      |z  d
k  rd}|s| xj*                  dz  c_        d| _        | j$                  rD| xj0                  dz  c_        | xj2                  t!        | j                        z  c_        d| _        d| _        d| _        d	| _        d	| _        y |dvr<|j5                         du r)t7        |      rd| _        | xj                  |z  c_        y y y y c c}}w )Nr   FT         ?c              3  <   K   | ]  }|j                           y wrH   )ru   ).0_s     r$   	<genexpr>z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>8  s     >AAIIK>s      r   rS   r   >   r   -<=>|~)r\   r   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   lenrC   r   ru   allr   zipranger   r   rN   r   )r"   r#   buffer_lengthcicamel_case_dstprobable_camel_caseds          r$   r)   zSuperWeirdWordPlugin.feed  s   LLI%Li())Q.)((E1i(E1^I5N9%.i(E1	*e3	*e3I&%/+/(y!Y'y)y)9%((A-(||>)#<Y@Wll!!$T\\!2M!!]2!!,,}<C04D- #4<<#34R(002>>>%G,,1,04D---204D-,,1,"t'?'? !$DLL%=2I J"1yy{ " "
 .3$!s>':]'Jc'Q+/(+,,1,04D-(($$)$))S->>),1)',D$DL()D%'(D$@@!!#u,)$(,D%LLI%L % - A1"s   /M1c                t    d| _         d| _        d| _        d| _        d| _        d| _        d| _        d| _        y )Nr   Fr   )r   r   r   r   r   rC   r   r   r+   s    r$   r,   zSuperWeirdWordPlugin.reset_  sA    $)!#(   !$%!#$ r'   c                r    | j                   dk  r| j                  dk(  ry| j                  | j                  z  S )N
   r   rR   )r   r   r   rC   r+   s    r$   r.   zSuperWeirdWordPlugin.ratioi  s7    r!d&>&>!&C((4+@+@@@r'   Nr5   r/   r3   r6   rU   r=   r'   r$   r   r      s.    *O&b% A Ar'   r   c                  B    e Zd ZdZddZd	dZd
dZddZedd       Z	y)CjkUncommonPluginz<
    Detect messy CJK text that probably means nothing.
    c                     d| _         d| _        y rP   rC   _uncommon_countr+   s    r$   rF   zCjkUncommonPlugin.__init__v  s    %&$%r'   c                    t        |      S rH   )r   r!   s     r$   r%   zCjkUncommonPlugin.eligiblez  s    i  r'   c                p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y r^   )rC   r   r   r!   s     r$   r)   zCjkUncommonPlugin.feed}  s4    "9%  A%  &r'   c                     d| _         d| _        y rP   r   r+   s    r$   r,   zCjkUncommonPlugin.reset  s     ! r'   c                l    | j                   dk  ry| j                  | j                   z  }|dkD  r|dz  S dS )Nra   rR   r   r   r   )r"   uncommon_form_usages     r$   r.   zCjkUncommonPlugin.ratio  sD      1$%)%9%9D<Q<Q%Q ,?+D"R'M#Mr'   Nr5   r/   r3   r6   )
r8   r9   r:   r;   rF   r%   r)   r,   r<   r.   r=   r'   r$   r   r   q  s1    &!! N Nr'   r   c                  >    e Zd ZddZddZd	dZddZed
d       Zy)ArchaicUpperLowerPluginc                f    d| _         d| _        d| _        d| _        d| _        d | _        d| _        y )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalrC   _last_alpha_seen_current_ascii_onlyr+   s    r$   rF   z ArchaicUpperLowerPlugin.__init__  s9    	45,23*890%&,0)- r'   c                     yrh   r=   r!   s     r$   r%   z ArchaicUpperLowerPlugin.eligible  ri   r'   c                Z   |j                         xr t        |      }|du }|r| j                  dkD  r| j                  dk  r?|j                         du r-| j                  du r| xj
                  | j                  z  c_        d| _        d| _        d | _        d| _        | xj                  dz  c_	        d| _        y | j                  du r|j                         du rd| _        | j                  |j                         r| j                  j                         s*|j                         rM| j                  j                         r3| j                  du r| xj                  dz  c_        d| _        nd| _        nd| _        | xj                  dz  c_	        | xj                  dz  c_        || _        y )NFr   @   r   TrM   )r\   r   r   rN   r   r   r   r   r   rC   isasciiru   islower)r"   r#   is_concerned	chunk_seps       r$   r)   zArchaicUpperLowerPlugin.feed  s    ((*J/?	/J E)	==A44:%%'50,,588668 23D.34D0$(D!DI!!Q&!'+D$##t+	0A0A0Cu0L',D$  ,!!#(=(=(E(E(G!!#(=(=(E(E(G99$66!;6 %DI $DI!	",,1, )r'   c                f    d| _         d| _        d| _        d| _        d | _        d| _        d| _        y )Nr   FT)rC   r   r   r   r   r   r   r+   s    r$   r,   zArchaicUpperLowerPlugin.reset  s9     !/0,-.*340 $	#' r'   c                T    | j                   dk(  ry| j                  | j                   z  S )Nr   rR   )rC   r   r+   s    r$   r.   zArchaicUpperLowerPlugin.ratio  s*      A%77$:O:OOOr'   Nr5   r/   r3   r6   rU   r=   r'   r$   r   r     s-    .(*T( P Pr'   r   c                  >    e Zd ZddZddZddZd	dZed
d       Zy)ArabicIsolatedFormPluginc                     d| _         d| _        y rP   rC   _isolated_form_countr+   s    r$   rF   z!ArabicIsolatedFormPlugin.__init__  s    %&)*!r'   c                     d| _         d| _        y rP   r   r+   s    r$   r,   zArabicIsolatedFormPlugin.reset  s     !$%!r'   c                    t        |      S rH   )r   r!   s     r$   r%   z!ArabicIsolatedFormPlugin.eligible  s    ##r'   c                p    | xj                   dz  c_         t        |      r| xj                  dz  c_        y y r^   )rC   r   r   r!   s     r$   r)   zArabicIsolatedFormPlugin.feed  s1    ""9-%%*% .r'   c                X    | j                   dk  ry| j                  | j                   z  }|S )Nra   rR   r   )r"   isolated_form_usages     r$   r.   zArabicIsolatedFormPlugin.ratio  s0      1$%)%>%>AVAV%V""r'   Nr5   r/   r3   r6   )	r8   r9   r:   rF   r,   r%   r)   r<   r.   r=   r'   r$   r   r     s*    +&$+ # #r'   r      )maxsizec                   | |y| |k(  ryd| v rd|v ryd| v sd|v ryd| v sd|v r	d| v sd|v ry| j                  d      |j                  d      }}|D ]  }|t        v r||v s y | dv |dv }}|s|r	d| v sd|v ry|r|ryd	| v sd	|v rd| v sd|v ry| d
k(  s|d
k(  ryd| v sd|v s| dv r!|dv rd| v sd|v ryd| v sd|v ry| d
k(  s|d
k(  ryy)za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    TFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr	   )r   r   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charss          r$   r   r     s    /"9/)/!g&@o%)G 	?"g&@&+*H 	c"c" '
  00!!	 	
	

 	33 ' 	, E_$<,?"h/&AO#u'?m+-/O 	 E_$<3377O+}/Oo%O)Cm+-/Or'   i   c           	        t         j                         D cg c]	  } |        }}t        |       dz   }d}|dk  rd}n
|dk  rd}nd}t        | dz   t	        |            D ]^  \  }}	|D ]%  }
|
j                  |      s|
j                  |       ' |	d	kD  r|	|z  d	k(  s	|	|dz
  k(  sFt        d
 |D              }||k\  s^ n |rt        d      }|j                  t        d| d| d|        t        |       dkD  r8|j                  t        d| dd         |j                  t        d| dd         |D ]1  }|j                  t        |j                   d|j                          3 t        |d      S c c}w )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    r   rR   i       r   r      
r   c              3  4   K   | ]  }|j                     y wrH   )r.   )r   dts     r$   r   zmess_ratio.<locals>.<genexpr>e  s     !?r"((!?s   charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=   zStarting with: NzEnding with: iz:    )r   __subclasses__r   r   r   r%   r)   sumr   logr   	__class__r.   round)decoded_sequencemaximum_thresholddebugmd_class	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr#   indexdetectorloggerr   s                r$   
mess_ratior  F  s    $6#D#D#F+
+I + &'!+F O|13)	4,.),/) 04 7vG 	5! 	)H  +i(	)
 AI%"CCqHfqj !!?Y!??O"33 /0

11R0SSdetdu v!!2 35	
  2%JJu0@"0E/FGHJJu.>su.E-FGH 	=BJJub
;<	= !$$[+s   E6N)r   
str | Noner   r  r1   r2   )g?F)r   r0   r   r7   r   r2   r1   r7   ))
__future__r   	functoolsr   loggingr   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r?   rW   rd   rn   ry   r   r   r   r   r   r  r=   r'   r$   <module>r	     s<   "   
    ," "D,L'9 ,L^O1 O6E* E0"D&8 "DJ./( ./bsA- sAl N*  NFIP0 IPX#1 #8 4FF2<F	F FR 4IN4%4%.34%BF4%
4% 4%r'   