
    ,h                     B    d dl Z d dlZd dlmZ ddlmZ  G d de      Zy)    N)BeautifulSoup   )
BaseParserc                   L    e Zd ZdZg dZg dZd Zd Zd Zd Z	ddZ
d	 Zd
 Zy)ParserzExtract text from html file using beautifulsoup4. Filter text to
    only show the visible parts of the page. Insipration from `here
    <http://stackoverflow.com/a/1983219/564709>`_.
    )	stylescriptz
[document]headtitlehtmlmetalinkbody)bbigismallttabbracronymcitecodedfnemkbdstrongsampvarabdobrimgmapobjectqr	   spansubsupbuttoninputlabelselecttextareac                     |j                   | j                  v ryt        j                  dt	        j
                  |j                                     ryy)zKUsed to filter text elements that have invisible text on the page.
        Fz	<!--.*-->T)name_disallowed_namesrematchsix	text_typeextractselfelements     ^/var/www/html/Resume-Scraper/venv/lib/python3.12/site-packages/textract/parsers/html_parser.py_visiblezParser._visible   s=     <<4111XXlCMM'//2C$DE    c                 6    |j                   | j                  v ryy)zoUsed to check whether given element can be treated as inline
        element (without new line after).
        TF)r/   _inline_tagsr6   s     r9   _inlinezParser._inline$   s     <<4,,,r;   c                     d}|St        j                  |      }t        j                  dd|      }t        j                  dd|      }|j	                         }|S )z6Looks for any possible text within given tag.
         z	(<[^>]+>)z\s )r3   r4   r1   r'   strip)r7   tagtexts      r9   _find_any_textzParser._find_any_text,   sP     ?==%D66,D1D66%d+D::<Dr;   c                    g }|j                  d      D ]+  }d|g i d}|j                  d      }t        |      dkD  s+|D ]  }g }|j                  d      |j                  d      z   }t        |      dkD  s7t        |      D ]v  \  }	}
| j                  |
      }t        |      }|	|d   v rt	        ||d   |	         |d   |	<   n||d   |	<   |j                  |t        |
j                  dd	            d
       x |d   j                  |        |d   D ]  }|dxx   |d   |   z  cc<    |j                  |       . |S )zzReturns array containing basic informations about tables for ASCII
        replacement (look: _replace_tables()).
        tabler   )widthrG   trs	col_widthtrthtdrJ   colspanr   )rD   rN   rI   rH   )find_alllen	enumeraterE   maxappendintget)r7   souptablestt_dictrI   rK   tr_dicttdsr   rM   td_textlengthcols                 r9   _parse_tableszParser._parse_tables7   st    w' 	&A 1RbIF**T"C3x!| 6B G++d+bkk$.??C3x!|%.s^ EAr&*&9&9"&=G%(\F F;$779<$*$*;$7$::"{ 3A 6
 :@{ 3A 6#NN(/+.rvvi/C+D,  u,,W5%6& "+. @C7Ovk':3'??O@f%3	&4 r;   c           	      X   | j                  |      }t        |      }|j                         }|D ]  }d}|d   }	d|t        |d         z  z   |d   z   }
||
z  dz   }||z  }|	D ]  }||z  }t        |      D ]x  \  }}|d   }|d   |   |z   }|d   dkD  rAt	        |d   dz
        D ]-  }|dz   }||z   t        |d         k  s||d   ||z      |z   z  }/ |d	t        |      z   d
z   ||z   z  z  }z |dz  } ||z  }|j                  d      }||_        |d   j                  |        |S )z=Replaces <table> elements with its ASCII equivalent.
        r@   rI   r   rJ   rH   
rD   rN   %sdivrG   )	r_   rP   lstriprQ   rangestrnew_tagstringreplace_with)r7   rV   v_separatorh_separatorrW   	v_sep_len
v_left_seprX   r   rI   h_length	head_footrK   r   rM   rD   rJ   j	new_tables                      r9   _replace_tableszParser._replace_tablesX   s    ##D)$	 '')
 	/ADE(CIAkN(;;<qzIH$x/47IID 
"&r] PEArf:D !+q 1I =I)}q(!&r)}Q!7 MA !AA !!s1[>':: )Q{^AaC-@9-L L	M S3y>1C7D;<NOODP  IDU+I#IgJ##I.+	/, r;   c                 z    |j                  d      }|D ]$  }| j                  |      s|j                          & |S )z>Unwraps inline elements defined in self._inline_tags.
        T)rO   r>   unwrap)r7   rV   elementselems       r9   _join_inlineszParser._join_inlinesv   s=     ==& 	D||D!	 r;   c                    t        |d      5 }t        |d      }d d d        | j                        }| j                  |      }d}|j	                  d      }t        | j                  |      D cg c]  }| }}|D ]K  }|j                  }	|	| j                  |      }	|	j                         }	t        |	      dkD  sA|d|	z   dz   z  }M |S # 1 sw Y   xY wc c}w )Nrblxmlr@   Tr   ra   )openr   rs   rx   rO   filterr:   ri   rE   rB   rP   )
r7   filenamekwargsstreamrV   r   rv   elrw   ri   s
             r9   r5   zParser.extract   s    (D! 	1V 0D	1 ##D) !!$' ==&!'x!@A2BAA 	-D[[F~,,T2\\^F6{Qv,,	- )	1 	1 Bs   C/	CCN)z | -)__name__
__module____qualname____doc__r0   r=   r:   r>   rE   r_   rs   rx   r5    r;   r9   r   r   	   s:    

L	B<r;   r   )r1   r3   bs4r   utilsr   r   r   r;   r9   <module>r      s    	 
  KZ Kr;   