'''
File "heatlib.py" by KWR (except as noted) for CSE199, Fall 2024 (originally 2017).
Computes a "heat index" for a given webpage from a given dictionary of
"intense" words and their intensity values in several categories.
Tailored to the Canadian National Research Council Affect Intensity Lexicon
(NCRAIL, see http://saifmohammad.com/WebPages/AffectIntensity.htm)
by Saif Mohammad, used with his permission (and request not to redistribute).
However, works with any file of lines of the form

word  category  #.###  (pre-2024 version has: word  #.###  category)

NRCAIL's original four categories 'anger', 'fear', 'sadness', 'joy' are not treated
specially, so that clients can define any categories they wish.
This is in fact done with the 'disgust' category.
Dictionary file can be local or loaded from a URL.

Python 3 Trinket seems fully functional with version 3 of the Python URL Library
(urllib3).  Commented-out lines in this file were used with earlier versions.
This handling was sometimes inconsistent as the Trinket web app evolved.
In case of glitches with the urllib3-based code, various commented-out lines
might be swapped in to revert to earlier working versions.
'''

from __future__ import division    #must be first line as of now
from html.parser import HTMLParser
import re
from re import sub
from sys import stderr
from traceback import print_exc
import urllib.request
import urllib3
#import requests.get
#import requests    #not on CSE machines, comment-in if using Python 3 Trinket
#import nltk  # not used---see next note

'''
The discontinued "clean_html(...)" function from nltk
From https://stackoverflow.com/questions/26002076/python-nltk-clean-html-not-implemented
with a few further edits by KWR
'''
def clean_html(htmlstr): #from old NLTK package, type str -> str

    htmlstr = htmlstr.strip()  # removes leading and trailing whitespace
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", htmlstr)
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    cleaned = re.sub(r"  ", " ", cleaned)
    return cleaned.strip()


'''
By user "xperroni" at
https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/328969
Does not work as well...?
'''

class _DeHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        text = data.strip()
        if len(text) > 0:
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__text.append("\n\n")
        elif tag == 'br':
            self.__text.append("\n")

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.__text.append("\n\n")

    def text(self):
        return ''.join(self.__text).strip()


def dehtml(text):
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except:
        print_exc(file=stderr)
        return text

'''
Compile dictionary at location with lines of form
word   val=#.###   category
into map entries
headDict(word,category) = val
The mulDict argument specifies known categories but any new categories
encountered are added to it (with default multiplier 1), so it too is returned
'''
def loadDict(location, mulDict):
   # location: URL if starts with http, else treated as local file name
   # mulDict: map from (category) to float, new cats given multiplier 1.0
   heatDict = {}  # map from (word,category) to floats, returned along with mulDict
   # entryRE = re.compile('^\s*([A-Za-z]+)\s+(\d\.\d+)\s+([a-z]+)\s*$')   # use with v0.5 file
   entryRE = re.compile('^\s*([A-Za-z]+)\s+([a-z]+)\s+(\d\.\d+)\s*$')
   if location.startswith('http'):
      #source = urllib.urlopen(location).read().decode('utf-8').split('\n')
      #source = urllib.request.urlopen(location).read().decode('utf-8').split('\n')
      mgr = urllib3.PoolManager()
      #source = mgr.request("GET",location).read().decode('utf-8').split('\n')
      source = mgr.request("GET",location).data.decode('utf-8').split('\n')
      #source = urllib.request.get(location).text.split('\n')   # Python 3 Trinket
   else:
      source = open(location, 'r')

   for line in source:
      lineMatch = entryRE.match(line)
      if lineMatch:
         word = lineMatch.group(1)
         #val = float(lineMatch.group(2))   #v0.5
         cat = lineMatch.group(2)
         #cat = lineMatch.group(3)   #v0.5
         val = float(lineMatch.group(3))
         if cat not in mulDict:
            print("New category",cat,"given multiplier 1.0")
            mulDict[cat] = 1.0
         entry = (word,cat)
         heatDict[entry] = val  # assume any repeated (word,cat) entry overrides the old
   return (heatDict, mulDict)


'''
Compute "heat score" of webpage at url using heatDict and category multipliers in mulDict.
Parameters have types string, (string x string) -> float, and string -> float.

Returns heatIndex, intenseCountUnique, intenseCount, wordCount, catDict
where heatIndex is a float whose sign and meaning depends on the multipliers
and the three counts are ints counting occurrences of intense words, occurrences
multiplied by the number of categories the word hits, and total body-text words read.
Finally catDict is a double-decker map category -> (word -> #timesWordOccurs)
which records the read words falling into each category.

Technotes: Uses 3rd-party tool to clean HTML frontmatter, endmatter, and markup from webpage.
Does not remove hyphens form words; converts underscores to hyphens.
If the dictionary removes hyphens this policy should be changed to match it.
Changing the tool and policies can affect the results.
'''

def heatScore(url, heatDict, mulDict):
   wordCount = 0
   intenseCount = 0
   intenseCountUnique = 0
   score = 0.0
   normalizer = 0.0
   for key in mulDict:
      normalizer += abs(mulDict[key])

   catDict = {}  # a double-level map cat -> (word -> numTimesItAppears)
   ALPHALCH = re.compile(r'^[-a-z]+$')          # initial r means "raw string" which
   ALPHANUMH = re.compile(r'^[-0-9A-Za-z]+$')   # defends against escaping special chars

   #page = urllib.urlopen(url)
   #page = urllib.request.urlopen(url)
   mgr = mgr = urllib3.PoolManager()
   page = mgr.request("GET",url)
   #page = requests.get(url)
   #htmlStr = page.read().decode('utf-8')
   htmlStr = page.data.decode('utf-8')
   #htmlStr = page.text
   pageStr = clean_html(htmlStr)       # use discontinued "nltk" routine, included above
   #pageStr = dehtml(htmlStr)          # use 4th-party script
   pageArray = pageStr.split()
   #print("I read:",pageStr)
   for word in pageArray:
      word = re.sub(r'[-]','_',word)   # change hyphens to underscores to save them
      word = re.sub(r'[^\w]','',word)  # one way to strip all non-"word" chars but not '_'
      word = re.sub(r'[_]+','',word)   # all underscores and old hyphens get stripped too.
      #word = re.sub(r'[_]+','-',word) # all underscores get changed to one hyphen
      wordlc = word.lower()
      lastLookup = ""
      wordlcMatch = ALPHALCH.match(wordlc)
      if wordlcMatch:
         wordCount += 1
         for cat in mulDict:
            if (wordlc,cat) in heatDict:
               intenseCount += 1
               if wordlc != lastLookup:
                  intenseCountUnique += 1
                  lastLookup = wordlc

               score += mulDict[cat] * heatDict[(wordlc,cat)]

               if cat in catDict:
                  if wordlc in catDict[cat]:
                     catDict[cat][wordlc] += 1
                  else:    # allocate new
                     catDict[cat][wordlc] = 1
               else:       # allocate new
                  catDict[cat] = {}  # an empty category map
                  catDict[cat][wordlc] = 1
            else:    # (word does not hit that category)
               pass  # most words will fall thru all categories
         # end for cat in multDict
      else: # a stripped word that doesn't match lowercase alpha + hyphen is worth noting
         if wordlc != "" and not ALPHANUMH.match(word):
            print("Unusual word",word)
   # end for

   heatIndex = score/wordCount if wordCount > 0 else 0.0
   heatIndex = 1000.0*heatIndex/normalizer  # the 1000.0 is arbitrary
   return heatIndex, intenseCountUnique, intenseCount, wordCount, catDict