''' File "heatlib.py" by KWR (except as noted) for CSE199, Fall 2017. Computes a "heat index" for a given webpage from a given dictionary of "intense" words and their intensity values in several categories. Tailored to the Canadian National Research Council Affect Intensity Lexicon http://saifmohammad.com/WebPages/AffectIntensity.htm by Saif Mohammad, used with his permission (and request not to redistribute). However, works with any file of lines of the form word #.### category where NRCAIL's categories 'anger', 'fear', 'sadness', 'joy' are not treated specially, so that clients can define any categories they wish. Dictionary file can be local or loaded from a URL. ''' from __future__ import division #must be first line as of now from html.parser import HTMLParser import re from re import sub from sys import stderr from traceback import print_exc import urllib.request #import requests.get #import requests #not on CSE machines, comment-in if using Python 3 Trinket #import nltk # not used---see next note ''' The discontinued "clean_html(...)" function from nltk From https://stackoverflow.com/questions/26002076/python-nltk-clean-html-not-implemented with a few further edits by KWR ''' def clean_html(htmlstr): #from old NLTK package, type str -> str htmlstr = htmlstr.strip() # removes leading and trailing whitespace # First we remove inline JavaScript/CSS: cleaned = re.sub(r"(?is)<(script|style).*?>.*?()", "", htmlstr) # Then we remove html comments. This has to be done before removing regular # tags since comments can contain '>' characters. cleaned = re.sub(r"(?s)[\n]?", "", cleaned) # Next we can remove the remaining tags: cleaned = re.sub(r"(?s)<.*?>", " ", cleaned) # Finally, we deal with whitespace cleaned = re.sub(r" ", " ", cleaned) cleaned = re.sub(r" ", " ", cleaned) cleaned = re.sub(r" ", " ", cleaned) return cleaned.strip() ''' By user "xperroni" at https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/328969 Does not work as well...? 
''' By user "xperroni" at
    https://stackoverflow.com/questions/328356/extracting-text-from-html-file-using-python/328969
    Does not work as well...?
'''
class _DeHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.__text = []

    def handle_data(self, data):
        text = data.strip()
        if len(text) > 0:
            text = sub('[ \t\r\n]+', ' ', text)
            self.__text.append(text + ' ')

    def handle_starttag(self, tag, attrs):
        if tag == 'p':
            self.__text.append("\n\n")
        elif tag == 'br':
            self.__text.append("\n")

    def handle_startendtag(self, tag, attrs):
        if tag == 'br':
            self.__text.append("\n\n")

    def text(self):
        return ''.join(self.__text).strip()


def dehtml(text):
    try:
        parser = _DeHTMLParser()
        parser.feed(text)
        parser.close()
        return parser.text()
    except:
        print_exc(file=stderr)
        return text


''' Compile the dictionary at location, whose lines have the form

        word   #.###   category

    into map entries heatDict[(word, category)] = value.  The mulDict argument
    specifies the known categories, but any new categories encountered are
    added to it (with default multiplier 1.0), so it too is returned.
'''
def loadDict(location, mulDict):
    # location: URL if it starts with 'http', else treated as a local file name
    # mulDict:  map from category to float; new categories are given multiplier 1.0
    heatDict = {}   # map from (word, category) to float, returned along with mulDict
    entryRE = re.compile(r'^\s*([A-Za-z]+)\s+(\d\.\d+)\s+([a-z]+)\s*$')
    if location.startswith('http'):
        #source = urllib.urlopen(location).read().decode('utf-8').split('\n')
        source = urllib.request.urlopen(location).read().decode('utf-8').split('\n')
        #source = requests.get(location).text.split('\n')   # Python 3 Trinket
    else:
        source = open(location, 'r')
    for line in source:
        lineMatch = entryRE.match(line)
        if lineMatch:
            word = lineMatch.group(1)
            val = float(lineMatch.group(2))
            cat = lineMatch.group(3)
            if cat not in mulDict:
                print("New category", cat, "given multiplier 1.0")
                mulDict[cat] = 1.0
            entry = (word, cat)
            heatDict[entry] = val   # any repeated (word, cat) entry overrides the old value
    return (heatDict, mulDict)
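''' A self-contained usage sketch for loadDict: it writes a made-up three-line
    lexicon to a temporary file and loads it back.  The words, values, and the
    extra 'calm' category are invented for illustration; real NRCAIL lines have
    the same "word  value  category" shape.
'''
def _demo_loadDict():
    import os, tempfile
    tmp = tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False)
    tmp.write("fury\t0.900\tanger\nchuckle\t0.300\tjoy\nserene\t0.200\tcalm\n")
    tmp.close()
    mulDict = {'anger': 1.0, 'fear': 1.0, 'sadness': 1.0, 'joy': 1.0}
    heatDict, mulDict = loadDict(tmp.name, mulDict)   # 'calm' gets added with multiplier 1.0
    print(heatDict[('fury', 'anger')])   # 0.9
    print(sorted(mulDict))               # ['anger', 'calm', 'fear', 'joy', 'sadness']
    os.remove(tmp.name)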
''' Compute the "heat score" of the webpage at url, using heatDict and the
    category multipliers in mulDict.  Parameter types: string,
    (string x string) -> float, and string -> float.

    Returns heatIndex, intenseCountUnique, intenseCount, wordCount, catDict,
    where heatIndex is a float whose sign and meaning depend on the
    multipliers, and the three counts are ints counting occurrences of
    intense words, occurrences multiplied by the number of categories the
    word hits, and total body-text words read.  Finally, catDict is a
    double-decker map category -> (word -> #timesWordOccurs) which records
    the read words falling into each category.

    Technotes: Uses a 3rd-party tool to clean HTML frontmatter, endmatter,
    and markup from the webpage.  As coded below, hyphens and underscores are
    stripped from words (a commented-out alternative preserves them as single
    hyphens instead); if the dictionary keeps hyphens, this policy should be
    changed to match it.  Changing the tool and these policies can affect the
    results.
'''
def heatScore(url, heatDict, mulDict):
    wordCount = 0
    intenseCount = 0
    intenseCountUnique = 0
    score = 0.0
    normalizer = 0.0
    for key in mulDict:
        normalizer += abs(mulDict[key])
    catDict = {}   # a double-level map cat -> (word -> numTimesItAppears)
    ALPHALCH = re.compile(r'^[-a-z]+$')         # initial r means "raw string", which
    ALPHANUMH = re.compile(r'^[-0-9A-Za-z]+$')  # defends against escaping special chars
    #page = urllib.urlopen(url)
    page = urllib.request.urlopen(url)
    #page = requests.get(url)
    htmlStr = page.read().decode('utf-8')
    #htmlStr = page.text
    pageStr = clean_html(htmlStr)   # use discontinued "nltk" routine, included above
    #pageStr = dehtml(htmlStr)      # use 4th-party script
    pageArray = pageStr.split()
    for word in pageArray:
        word = re.sub(r'[-]', '_', word)    # change hyphens to underscores to save them
        word = re.sub(r'[^\w]', '', word)   # one way to strip all non-"word" chars but not '_'
        word = re.sub(r'[_]+', '', word)    # all underscores and old hyphens get stripped too
        #word = re.sub(r'[_]+', '-', word)  # all underscores get changed to one hyphen
        wordlc = word.lower()
        lastLookup = ""
        wordlcMatch = ALPHALCH.match(wordlc)
        if wordlcMatch:
            wordCount += 1
            for cat in mulDict:
                if (wordlc, cat) in heatDict:
                    intenseCount += 1
                    if wordlc != lastLookup:
                        intenseCountUnique += 1
                        lastLookup = wordlc
                    score += mulDict[cat] * heatDict[(wordlc, cat)]
                    if cat in catDict:
                        if wordlc in catDict[cat]:
                            catDict[cat][wordlc] += 1
                        else:   # allocate new
                            catDict[cat][wordlc] = 1
                    else:       # allocate new
                        catDict[cat] = {}   # an empty category map
                        catDict[cat][wordlc] = 1
                else:   # (word does not hit that category)
                    pass    # most words will fall thru all categories
            # end for cat in mulDict
        else:   # a stripped word that doesn't match lowercase alpha + hyphen is worth noting
            if wordlc != "" and not ALPHANUMH.match(word):
                print("Unusual word", word)
    # end for word in pageArray
    heatIndex = score/wordCount if wordCount > 0 else 0.0
    heatIndex = 1000.0*heatIndex/normalizer if normalizer > 0 else 0.0   # the 1000.0 is arbitrary
    return heatIndex, intenseCountUnique, intenseCount, wordCount, catDict
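''' A minimal end-to-end sketch tying loadDict and heatScore together.
    The lexicon path, starting multipliers, and target URL below are
    placeholders (assumptions), not part of the course setup; substitute
    real ones before running.
'''
if __name__ == '__main__':
    startCats = {'anger': 1.0, 'fear': 1.0, 'sadness': 1.0, 'joy': -1.0}          # hypothetical signs
    heatDict, mulDict = loadDict('NRC-AffectIntensity-Lexicon.txt', startCats)    # placeholder path
    hi, uniq, hits, words, catDict = heatScore('http://www.example.com/', heatDict, mulDict)
    print("Heat index:", hi)
    print(uniq, "intense word occurrences,", hits, "category hits, out of", words, "words")
    for cat in catDict:
        print(" ", cat, ":", len(catDict[cat]), "distinct intense words")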