'''
File "NFLtest.py" by KWR for CSE199, Fall 2022.  File for Internet and Data Homework 2.
Requires: file "NFLTeams.xml" accessible via web (for Python Trinket web use)
or in same directory or otherwise listed in an IDE as a peer project file.
Requires the "pandas" and "sklearn" packages, which are on the UB CSE machines
and accessible via Python 3 Trinket but may be absent from some Python 3 downloads.

Usage: Runs automatically in Python 3 Trinket, should work similarly in other web apps.
Command-line usage: python3 NFLtest.py       (no other arguments)

Illustrates simple linear regression and the R^2 measure of correlation.
Tries to "explain" Y = the number of seasons since an NFL team last won a 
playoff game using X = the number of head coaches the team has had since 2010
and then by trying Z = the population of the team's media market (in millions).
Which is more correlated to Y---is it X or Z?  Are the correlations significant?
'''

from __future__ import division
import numpy as np
import matplotlib
matplotlib.use('Agg')  # Used for linux. To make sure the backend is a non-gui backend
from matplotlib import pyplot as plt
import scipy
import scipy.stats, scipy.optimize, scipy.special
from scipy.optimize import curve_fit
from pylab import *
import pandas as pd
from sklearn import linear_model

import sys
import re
import xml.etree.ElementTree as ET
from html.parser import HTMLParser
from traceback import print_exc
#import urllib
import urllib.request    #seems required in 2022


'''
If we don't get "matplotlib" working easily in a "sandbox" or "fiddle" environment
where you can see the plot without a separate viewer, we can use this crude ASCII
text plotting routine.  Plotted points are rounded to the nearest "step" value.
'''
def crudeplot(XX,YY,xc,yc,xname,yname):
    '''
    Render the points (XX[i], YY[i]) as a crude ASCII scatter plot.

    XX, YY       -- sequences of numbers, paired by position
    xc, yc       -- width and height of the plot grid in characters
    xname, yname -- labels printed on the bottom and top range lines

    Returns the whole plot as one multi-line string.  The largest Y value is
    on the top row; each row that holds at least one point is annotated on
    the right with the Y value of the last point placed on that row.

    Raises ValueError if xc or yc is less than 1, or if XX or YY is empty.
    '''
    if xc < 1 or yc < 1:
        raise ValueError("crudeplot needs at least one column and one row")

    # np.asfarray was removed in NumPy 2.0; asarray with dtype=float is equivalent.
    X = np.asarray(XX, dtype=float)
    Y = np.asarray(YY, dtype=float)
    minX, maxX = float(X.min()), float(X.max())
    minY, maxY = float(Y.min()), float(Y.max())

    # Width of one character cell in data units.  A step of 0 means the data
    # has no spread along that axis (or the grid is one cell wide), in which
    # case every point goes into cell 0 instead of dividing by zero.
    xstep = (maxX - minX)/(xc - 1.0) if xc > 1 else 0.0
    ystep = (maxY - minY)/(yc - 1.0) if yc > 1 else 0.0

    rowvals = [None]*yc
    grid = [[' ']*xc for _ in range(yc)]
    for x, y in zip(X, Y):
        col = round(float((x - minX)/xstep)) if xstep else 0
        row = round(float((maxY - y)/ystep)) if ystep else 0   # row 0 is the top
        rowvals[row] = y
        grid[row][col] = '*'

    # Collect the lines and join once rather than growing a string with +=.
    lines = ["", "/" + str(minY) + " ... " + yname + " ... " + str(maxY)]
    for cells, val in zip(grid, rowvals):
        label = "  " + str(float(val)) if val is not None else ""
        lines.append('|' + "".join(cells) + label)
    lines.append('\\' + '-'*xc)
    lines.append(" " + str(minX) + " ... " + xname + " ... " + str(maxX))
    return "\n".join(lines) + "\n"

'''
Parser for XML nested no further than a simple 2-dimensional table.
Formats the table with named column fields in a "Pandas DataFrame" object.
Later we extract columns into simple Python arrays.
'''

def _numeric_or_unchanged(column):
    '''Return column converted to a numeric dtype, or column itself if any value fails to parse.'''
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


def xml2df(xml_data, attributesAreFields, convertNumeric=True):
    '''
    Turn a two-level XML document into a pandas DataFrame, one row per child
    of the root element.

    xml_data            -- the XML document as a string (parsed with ET.XML)
    attributesAreFields -- if true, a row's fields are the child's XML
                           attributes; otherwise they are the tag -> text of
                           the child's own sub-elements
    convertNumeric      -- if true, every column whose values all parse as
                           numbers is converted to a numeric dtype; columns
                           that do not fully parse are left as they were

    Returns the DataFrame.
    '''
    root = ET.XML(xml_data) # element tree built from the string
    if attributesAreFields:
        all_records = [dict(child.attrib) for child in root]
    else:
        # Exactly one record per child; the fields come from its sub-elements.
        all_records = [{sub.tag: sub.text for sub in child} for child in root]

    df = pd.DataFrame(all_records)
    if convertNumeric:
        # Same effect as the deprecated pd.to_numeric(errors='ignore'),
        # applied column by column.
        df = df.apply(_numeric_or_unchanged)
    return df
    

# main

# Where the team data lives: a web address, or a local file name.
#location = 'NFLTeams.xml'
location = "https://www.cse.buffalo.edu/~regan/cse199/NFLTeams.xml"
#location = "https://www.cse.buffalo.edu/~regan/cse199/NFLTeamsNoCIN.xml"   #2017
#location = "https://www.cse.buffalo.edu/~regan/cse199/NFLTeamsNoDET.xml"   #use in 2022

# Read the whole XML document into one string.  The "with" blocks make sure
# the network response or the file is closed once it has been read.
#you might need to vary the next lines on alternate systems
if location.startswith('http'):
    #source = urllib.urlopen(location).read().decode('utf-8').split('\n')
    #source = requests.get(location).text.split('\n')   # Python 3 Trinket
    with urllib.request.urlopen(location) as response:
        source = response.read().decode('utf-8')
else:
    with open(location, 'r') as infile:
        source = infile.read()   # already text in Python 3: don't .decode('utf-8') it

# True: each team's fields are taken from the XML attributes of its element
NFLdf = xml2df(source, True)

# Illustrating that arrays can do +- or */ with scalars
Yarr = 2021 - np.array(NFLdf['ylpw'])   # changed from 'lastPlayoffWin'

# Population in millions.  The 'pop' column may reach this point either as
# text containing thousands separators or as an already-numeric column,
# depending on the data file and system.  Going through astype(str) first
# lets one line handle both: the .str accessor alone fails on a numeric column.
Zarr = np.array(NFLdf['pop'].astype(str).str.replace(',', '').astype(float))/1000000.0

#2022 NOTE^^^^: Commas inside integers may not be parsed automatically on some systems
#The Yarr line still seems to be accepted as-is, since the years don't have commas.


# Extend a DataFrame with new columns---the "NFLdf = " part is needed
NFLdf = NFLdf.assign(drought=Yarr, mpop=Zarr)

# Columns of the new data object still need to be cast as DataFrames
X = pd.DataFrame(NFLdf['cs2010'])   # X = pd.DataFrame(NFLdf.cs2010) works too
Y = pd.DataFrame(NFLdf.drought)   # or = pd.DataFrame(NFLdf['drought']) but not pd.DataFrame(Yarr)
Z = pd.DataFrame(NFLdf.mpop)      # or could use Zarr

# When you make a table with 2 columns the casts start to make sense
W = pd.DataFrame(NFLdf, columns=['mpop','cs2010'])

# Text-mode previews of both relationships: 61 columns wide, 27 rows tall
print(crudeplot(NFLdf['cs2010'],Yarr,61,27,"Coaches since 2010","Years since last playoff win"))
print(crudeplot(Zarr,Yarr,61,27,"Population in millions","Years since last playoff win"))


# The regression estimator object used for the fit below.
lm = linear_model.LinearRegression()

# Regress drought (Y) on number of coaches (X) and pull the fitted numbers out
# straight away.  NOTE: per scikit-learn's API, fit() returns the estimator
# itself, so "model1" and "lm" are two names for the same object.
model1 = lm.fit(X,Y)
slope1 = model1.coef_[0][0]
intercept1 = model1.intercept_[0]
score1 = model1.score(X,Y)
pred1 = model1.predict(X)      # fitted drought value for each team

# Axis limits: one unit of headroom past the largest value on each axis.
# These use the NFLdf columns; the original author notes X does NOT work here.
xlimit1 = 1+NFLdf.cs2010.max()
ylimit1 = 1+NFLdf.drought.max()

# Scatter of the raw data, then the regression line on the same figure.
fig1ax = NFLdf.plot(kind='scatter', x='cs2010', y='drought', color='blue')
plt.plot(X, pred1, color='green', linewidth=1)  # NFLdf.cs2010 w/o the cast would also do for X
plt.axis([0, xlimit1, -1, ylimit1])
plt.xticks(np.arange(xlimit1))
plt.yticks(np.arange(ylimit1))
plt.title('Playoff Drought Versus Coaching Changes')
plt.xlabel('Number of coaches since 2010')
plt.ylabel('Number of seasons since last playoff win')

plt.savefig('XYplot.png')


# Regress drought (Y) on media-market population (Z).  This model gets its own
# estimator object, so its fitted state neither depends on nor is overwritten
# by any other fit in this script.
model2 = linear_model.LinearRegression().fit(Z,Y)    # Though called "Z", the source variable still comes first
intercept2 = model2.intercept_[0]
slope2 = model2.coef_[0][0]
score2 = model2.score(Z,Y)     # R^2 of model2 on the data it was fitted to

pred2 = model2.predict(Z)      # fitted drought value for each team
fig2ax = NFLdf.plot(kind='scatter', x='mpop', y='drought', color='blue')

# Regression line drawn over the scatter plot just created
plt.plot(Z, pred2, color='green', linewidth=1)
plt.axis([0, 1+NFLdf.mpop.max(), -1, 1+NFLdf.drought.max()])   # But here Z does NOT work!
plt.xticks(np.arange(1+NFLdf.mpop.max()))
plt.yticks(np.arange(1+NFLdf.drought.max()))
plt.xlabel('Media market population in millions')
plt.ylabel('Number of seasons since last playoff win')
plt.title('Playoff Drought Versus Media Market Size')

plt.savefig('ZYplot.png')

# Regress drought (Y) on both predictors at once.  A dedicated estimator is
# used so that this fit does not overwrite the state of any earlier model.
model3 = linear_model.LinearRegression().fit(W,Y)
intercept3 = model3.intercept_[0]
slope31 = model3.coef_[0][0]   # coefficient of W's first column, 'mpop'
slope32 = model3.coef_[0][1]   # coefficient of W's second column, 'cs2010'
score3 = model3.score(W,Y)

# Tell the reader that any warning text printed earlier can be disregarded.
print("\nIgnore any warning(s) above")
print()
print()

# Format every number once: 3 decimals for intercepts and slopes, 4 for R^2.
icept1_txt, coef1_txt, rsq1_txt = format(intercept1,'.3f'), format(slope1,'.3f'), format(score1,'.4f')
icept2_txt, coef2_txt, rsq2_txt = format(intercept2,'.3f'), format(slope2,'.3f'), format(score2,'.4f')
icept3_txt, rsq3_txt = format(intercept3,'.3f'), format(score3,'.4f')
coef31_txt, coef32_txt = format(slope31,'.3f'), format(slope32,'.3f')

# One fitted equation per model; sep='' keeps the pieces flush together.
print("Results for drought versus coaches:")
print("Drought in years = ", icept1_txt, " + ", coef1_txt,
      "*(number of coaches since 2010) with score R^2 = ", rsq1_txt, sep='')
print()

print("Results for drought versus population:")
print("Drought in years = ", icept2_txt, " + ", coef2_txt,
      "*(media market pop in millions) with score R^2 = ", rsq2_txt, sep='')
print()

print("Results for drought versus both:")
print("Drought in years = ", icept3_txt, " + ", coef31_txt,
      "*(media market pop in millions) + ", coef32_txt,
      "*(cs2010) with score R^2 = ", rsq3_txt, sep='')
print()