
    ,h!                     |   d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ 	 eZ	 ddlmZ 	 ddlmZ  G d	 d
e      Z	 ddlmZ  G d de      Z e       Zd ZddZ	 	 ddZ	 	 ddZddZ ddZ!d Z" e       Z#y# e$ r eefZY cw xY w# e$ r	 ddlmZ Y mw xY w# e$ r	 ddlmZ Y ww xY w# e$ r Y ^w xY w)z?
An interface to html5lib that mimics the lxml.html interface.
    N)
HTMLParser)TreeBuilder)etree)ElementXHTML_NAMESPACE_contains_block_level_tag)urlopen)urlparsec                       e Zd ZdZddZy)r   z*An html5lib HTML parser with lxml as tree.c                 >    t        j                  | f|t        d| y N)stricttree)_HTMLParser__init__r   selfr   kwargss      W/var/www/html/Resume-Scraper/venv/lib/python3.12/site-packages/lxml/html/html5parser.pyr   zHTMLParser.__init__   s    TM&{MfM    NF__name__
__module____qualname____doc__r    r   r   r   r      s    4Nr   r   )XHTMLParserc                       e Zd ZdZddZy)r   z+An html5lib XHTML Parser with lxml as tree.c                 >    t        j                  | f|t        d| y r   )_XHTMLParserr   r   r   s      r   r   zXHTMLParser.__init__*   s    !!$RvKR6Rr   Nr   r   r   r   r   r   r   '   s    9	Sr   r   c                 b    | j                  |      }||S | j                  dt        d|      S )N{})findr   )r   tagelems      r   	_find_tagr(   0   s.    99S>D99#677r   c                     t        | t              st        d      |t        }i }|t        | t              rd}|||d<    |j
                  | fi |j                         S )z
    Parse a whole document into a string.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    string requiredT
useChardet)
isinstance_strings	TypeErrorhtml_parserbytesparsegetroot)htmlguess_charsetparseroptionss       r   document_fromstringr7   7   sn     dH%)**~GD%!8   -6<<((0022r   c                 >   t        | t              st        d      |t        }i }|t        | t              rd}|||d<    |j
                  | dfi |}|rFt        |d   t              r3|r1|d   j                         rt        j                  d|d   z        |d= |S )a`  Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    r*   Fr+   divr   zThere is leading text: %r)	r,   r-   r.   r/   r0   parseFragmentstripr   ParserError)r3   no_leading_textr4   r5   r6   childrens         r   fragments_fromstringr?   O   s     dH%)**~GD%!8   -#v##D%;7;HJx{H5{  "''(C(0)4 5 5Or   c                 6   t        | t              st        d      t        |      }t	        | |||       }|rRt        |t              sd}t        |      }|r1t        |d   t              r|d   |_        |d= |j                  |       |S |st        j                  d      t        |      dkD  rt        j                  d      |d   }|j                  r<|j                  j                         r"t        j                  d|j                  z        d	|_        |S )
a  Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    r*   )r4   r5   r=   r9   r   zNo elements found   zMultiple elements foundzElement followed by text: %rN)r,   r-   r.   boolr?   r   textextendr   r<   lentailr;   )r3   create_parentr4   r5   accept_leading_textelementsnew_rootresults           r   fragment_fromstringrL   q   s    dH%)**}-#M&//1H -2!M=)(1+x0 (QKOOH% 344
8}q 9::a[F{{v{{((* > LMMFKMr   c                 t   t        | t              st        d      t        | ||      }| dd }t        |t              r|j                  dd      }|j                         j                         }|j                  d      s|j                  d      r|S t        |d	      }t        |      r|S t        |d
      }t        |      dk(  rW|j                  r|j                  j                         s1|d   j                  r|d   j                  j                         s|d   S t        |      r	d|_        |S d|_        |S )a  Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    'base_url' will set the document's base_url attribute (and the tree's
    docinfo.URL)

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    r*   )r5   r4   N2   asciireplacez<htmlz	<!doctypeheadbodyrA   r   r9   span)r,   r-   r.   r7   r0   decodelstriplower
startswithr(   rE   rC   r;   rF   r   r&   )r3   r4   r5   docstartrQ   rR   s          r   
fromstringr[      s    dH%)**
d6,9;C "IE% Wi0LLN  "E E$4$4[$A
S&!D 4y
S&!D 	D	Q		1Bbd2hmm&9&9&;Aw
 !& K Kr   c                     |t         }t        | t              s| }|.d}n+t        |       rt	        |       }|d}nt        | d      }|d}i }|r||d<    |j                  |fi |S )a*  Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    FTrbr+   )r/   r,   r-   _looks_like_urlr	   openr1   )filename_url_or_filer4   r5   fpr6   s        r   r1   r1      s     ~*H5! !M	-	.)*  M&-  MG  -6<<&g&&r   c                     t        |       d   }|syt        j                  dk(  r!|t        j                  v rt        |      dk(  ryy)Nr   Fwin32rA   T)r
   sysplatformstringascii_lettersrE   )strschemes     r   r^   r^      sB    c]1F
,,'
!f***Fq r   )NN)FNN)$r   rd   rf   html5libr   r    html5lib.treebuilders.etree_lxmlr   lxmlr   	lxml.htmlr   r   r   
basestringr-   	NameErrorr0   rh   urllib2r	   ImportErrorurllib.requestr
   urllib.parser   r!   xhtml_parserr(   r7   r?   rL   r[   r1   r^   r/   r   r   r   <module>ru      s      . 8  I IH'&!
N N!4Sl S =L830 0548D -237)X3l!'H
 lk  s|H  '&'  &%&  		sE   B B B" B3 	BBBB"B0/B03B;:B;