
G{,cu                 @   s   d  Z  d d l m Z d d l m Z d d l Z d d l m Z d d l m Z d d l	 m
 Z
 d d l Z d d	   Z Gd
 d   d e  Z d d   Z d d   Z d d   Z d S)a  
File "heatlib.py" by KWR (except as noted) for CSE199, Fall 2017.
Computes a "heat index" for a given webpage from a given dictionary of
"intense" words and their intensity values in several categories.
Tailored to the Canadian National Research Council Affect Intensity Lexicon
http://saifmohammad.com/WebPages/AffectIntensity.htm
by Saif Mohammad, used with his permission (and request not to redistribute).
However, works with any file of lines of the form

word    #.###   category

where NRCAIL's categories 'anger', 'fear', 'sadness', 'joy' are not treated
specially, so that clients can define any categories they wish.
Dictionary file can be local or loaded from a URL.
    )division)
HTMLParserN)sub)stderr)	print_excc             C   s   |  j    }  t j d d |   } t j d d |  } t j d d |  } t j d d |  } t j d d |  } t j d d |  } | j    S)Nz"(?is)<(script|style).*?>.*?(</\1>) z(?s)<!--(.*?)-->[\n]?z	(?s)<.*?> z&nbsp;z  )striprer   )ZhtmlstrZcleaned r   +/shared/web/faculty/regan/cse199/heatlib.py
clean_html!   s    r   c               @   sL   e  Z d  Z d d   Z d d   Z d d   Z d d   Z d	 d
   Z d S)_DeHTMLParserc             C   s   t  j |   g  |  _ d  S)N)r   __init___DeHTMLParser__text)selfr   r   r   r   9   s    z_DeHTMLParser.__init__c             C   sK   | j    } t |  d k rG t d d |  } |  j j | d  n  d  S)Nr   z[ 	
]+r   )r	   lenr   r   append)r   datatextr   r   r   handle_data=   s    z_DeHTMLParser.handle_datac             C   sB   | d k r |  j  j d  n | d k r> |  j  j d  n  d  S)Npz

br
)r   r   )r   tagattrsr   r   r   handle_starttagC   s    z_DeHTMLParser.handle_starttagc             C   s#   | d k r |  j  j d  n  d  S)Nr   z

)r   r   )r   r   r   r   r   r   handle_startendtagI   s    z _DeHTMLParser.handle_startendtagc             C   s   d j  |  j  j   S)Nr   )joinr   r	   )r   r   r   r   r   M   s    z_DeHTMLParser.textN)__name__
__module____qualname__r   r   r   r   r   r   r   r   r   r   8   s
   r   c          	   C   sN   y. t    } | j |   | j   | j   SWn t d t  |  SYn Xd  S)Nfile)r   Zfeedcloser   r   r   )r   Zparserr   r   r   dehtmlQ   s    	
r$   c             C   s  i  } t  j d  } |  j d  rQ t j j |   j   j d  j d  } n t	 |  d  } x | D] } | j
 |  } | rg | j d  } t | j d   } | j d  }	 |	 | k r t d	 |	 d
  d | |	 <n  | |	 f }
 | | |
 <qg qg W| | f S)Nz*^\s*([A-Za-z]+)\s+(\d\.\d+)\s+([a-z]+)\s*$Zhttpzutf-8r   r         zNew categoryzgiven multiplier 1.0g      ?)r
   compile
startswithurllibrequesturlopenreaddecodesplitopenmatchgroupfloatprint)locationmulDictheatDictZentryREsourcelineZ	lineMatchwordZvalcatentryr   r   r   loadDictc   s"    -r>   c             C   sX  d } d } d } d } d } x" | D] } | t  | |  7} q% Wi  }	 t j d  }
 t j d  } t j j |   } | j   j d  } t |  } | j	   } xr| D]j} t j
 d d |  } t j
 d d	 |  } t j
 d
 d	 |  } | j   } d	 } |
 j |  } | r| d 7} x | D] } | | f | k r*| d 7} | | k rk| d 7} | } n  | | | | | | f 7} | |	 k r| |	 | k r|	 | | d 7<qd |	 | | <qi  |	 | <d |	 | | <q*q*Wq | d	 k r | j |  r t d |  q q W| d k r1| | n d } d | | } | | | | |	 f S)Nr   g        z	^[-a-z]+$z^[-0-9A-Za-z]+$zutf-8z[-]_z[^\w]r   z[_]+r&   zUnusual wordg     @@)absr
   r)   r+   r,   r-   r.   r/   r   r0   r   lowerr2   r5   )urlr8   r7   	wordCountintenseCountintenseCountUniqueZscoreZ
normalizerkeycatDictZALPHALCHZ	ALPHANUMHZpageZhtmlStrZpageStrZ	pageArrayr;   ZwordlcZ
lastLookupZwordlcMatchr<   	heatIndexr   r   r   	heatScore   sT    


	
rI   )__doc__
__future__r   Zhtml.parserr   r
   r   sysr   	tracebackr   Zurllib.requestr+   r   r   r$   r>   rI   r   r   r   r   <module>   s   
+