
    ,h                     6    d dl Z d dlZddlmZ  G d de      Zy)    N   )ProbingStatec                   z    e Zd ZdZddZd Zed        Zd Zed        Z	d Z
ed	        Zed
        Zed        Zy)CharSetProbergffffff?Nc                 \    d | _         || _        t        j                  t              | _        y N)_statelang_filterlogging	getLogger__name__logger)selfr
   s     W/var/www/html/Resume-Scraper/venv/lib/python3.12/site-packages/chardet/charsetprober.py__init__zCharSetProber.__init__'   s#    &''1    c                 .    t         j                  | _        y r   )r   	DETECTINGr	   r   s    r   resetzCharSetProber.reset,   s    ",,r   c                      y r    r   s    r   charset_namezCharSetProber.charset_name/   s    r   c                      y r   r   )r   bufs     r   feedzCharSetProber.feed3   s    r   c                     | j                   S r   )r	   r   s    r   statezCharSetProber.state6   s    {{r   c                      y)Ng        r   r   s    r   get_confidencezCharSetProber.get_confidence:   s    r   c                 4    t        j                  dd|       } | S )Ns   ([ -])+    )resub)r   s    r   filter_high_byte_onlyz#CharSetProber.filter_high_byte_only=   s    ff&c2
r   c                     t               }t        j                  d|       }|D ]C  }|j                  |dd        |dd }|j	                         s|dk  rd}|j                  |       E |S )u9  
        We define three types of bytes:
        alphabet: english alphabets [a-zA-Z]
        international: international characters [-ÿ]
        marker: everything else [^a-zA-Z-ÿ]

        The input buffer can be thought to contain a series of words delimited
        by markers. This function works to filter all words that contain at
        least one international character. All contiguous sequences of markers
        are replaced by a single space ascii character.

        This filter applies to all scripts which do not use English characters.
        s%   [a-zA-Z]*[-]+[a-zA-Z]*[^a-zA-Z-]?N   r"   )	bytearrayr#   findallextendisalpha)r   filteredwordsword	last_chars        r   filter_international_wordsz(CharSetProber.filter_international_wordsB   s|     ;
 

O   
	'DOOD"I& RS	I$$&9w+> 	OOI&
	' r   c                 >   t               }d}d}t        t        |             D ]a  }| ||dz    }|dk(  rd}n|dk(  rd}|dk  s |j                         r1||kD  r'|s%|j	                  | ||        |j	                  d       |dz   }c |s|j	                  | |d	        |S )
a  
        Returns a copy of ``buf`` that retains only the sequences of English
        alphabet and high byte characters that are not between <> characters.
        Also retains English alphabet and high byte characters immediately
        before occurrences of >.

        This filter can be applied to all scripts which contain both English
        characters and extended ASCII characters, but is currently only used by
        ``Latin1Prober``.
        Fr   r      >   <Tr(   r"   N)r)   rangelenr,   r+   )r   r-   in_tagprevcurrbuf_chars         r   filter_with_english_lettersz)CharSetProber.filter_with_english_lettersg   s     ;#c(O 	 D4q)H4T! '!(*:*:*<$;v OOCTN3OOD)ax%	 *  OOCJ'r   r   )r   
__module____qualname__SHORTCUT_THRESHOLDr   r   propertyr   r   r   r    staticmethodr%   r1   r;   r   r   r   r   r   #   s    2
-       " "H ) )r   r   )r   r#   enumsr   objectr   r   r   r   <module>rC      s   :  	 nF nr   