''' File "NFLtest.py" by KWR for CSE199, Fall 2017.  Activity support file.

Requires: file "NFLTeams.xml" in same directory or as peer project file
(when "location" below names a local file rather than a URL).
Requires the "pandas" and "sklearn" packages which are on the UB CSE
machines but may be absent from some Python 3 download systems.
"matplotlib" and "sklearn" are imported only when the script is run, so
crudeplot() and xml2df() can still be imported where they are missing.

Usage: python3 NFLtest.py    (no other arguments)

Illustrates simple linear regression and the R^2 measure of correlation.
Tries to "explain" Y = the number of seasons since an NFL team last won a
playoff game using X = the number of head coaches the team has had since
1990 and then by trying Z = the population of the team's media market
(in millions).

Which is more correlated to Y---is it X or Z?
Are the correlations significant?
'''
from __future__ import division

import re
import sys
import urllib
import urllib.request   # "import urllib" alone does not load the request submodule
import xml.etree.ElementTree as ET
from html.parser import HTMLParser
from traceback import print_exc

import numpy as np
import pandas as pd
import scipy
import scipy.optimize
import scipy.special
import scipy.stats
from scipy.optimize import curve_fit


#location = 'NFLTeams.xml'
location = "https://www.cse.buffalo.edu/~regan/cse199/NFLTeams.xml"
#location = "https://www.cse.buffalo.edu/~regan/cse199/NFLTeamsNoCIN.xml"


def crudeplot(XX, YY, xc, yc, xname, yname):
    ''' Return a crude ASCII scatter plot of the points (XX[i], YY[i]).

    For use when "matplotlib" is not working easily in a "sandbox" or
    "fiddle" environment where a plot cannot be seen without a separate
    viewer.  Each point is rounded to the nearest cell of an xc-by-yc
    character grid; the largest Y is on the top row.  Every row holding a
    point is labelled with the Y value of the last point placed in it.

    XX, YY: equal-length, non-empty sequences of numbers.
    xc, yc: number of character columns and rows in the grid (each >= 1).
    xname, yname: axis captions for the footer and header lines.

    Returns the plot as one multi-line string.
    Raises ValueError for empty or mismatched data or a grid size below 1.
    '''
    X = np.asarray(XX, dtype=float)   # np.asfarray no longer exists in NumPy 2
    Y = np.asarray(YY, dtype=float)
    numpts = len(X)
    if numpts == 0:
        raise ValueError("crudeplot needs at least one point")
    if len(Y) != numpts:
        raise ValueError("crudeplot needs XX and YY of equal length")
    if xc < 1 or yc < 1:
        raise ValueError("crudeplot needs xc >= 1 and yc >= 1")

    minX, maxX = float(min(X)), float(max(X))
    minY, maxY = float(min(Y)), float(max(Y))

    # A zero step means one column/row or all-equal data; those points all
    # go in cell 0 instead of dividing by zero.
    xstep = (maxX - minX)/(xc - 1.0) if xc > 1 else 0.0
    ystep = (maxY - minY)/(yc - 1.0) if yc > 1 else 0.0

    rowlabels = [None]*yc
    grid = [[' ']*xc for _ in range(yc)]
    for xval, yval in zip(X, Y):
        col = round(float((xval - minX)/xstep)) if xstep else 0
        row = round(float((maxY - yval)/ystep)) if ystep else 0
        rowlabels[row] = yval
        grid[row][col] = '*'

    pieces = ["\n/" + str(minY) + " ... " + yname + " ... " + str(maxY) + "\n"]
    for cells, label in zip(grid, rowlabels):
        suffix = "" if label is None else " " + str(float(label))
        pieces.append('|' + "".join(cells) + suffix + "\n")
    pieces.append('\\' + '-'*xc + "\n")
    pieces.append(" " + str(minX) + " ... " + xname + " ... " + str(maxX) + "\n")
    return "".join(pieces)


def _coerce_numeric_columns(df):
    ''' Return a copy of df with number-like columns converted to numbers.

    A column in which no value parses as a number is left untouched.  In
    any other column, values that do not parse become NaN.
    '''
    result = df.copy()
    for name in df.columns:
        numeric = pd.to_numeric(df[name], errors='coerce')
        if numeric.notna().any():
            result[name] = numeric
    return result


def xml2df(xml_data, attributesAreFields, convertNumeric=True):
    ''' Parse XML nested no further than a simple 2-dimensional table.

    Formats the table with named column fields in a "Pandas DataFrame"
    object, one row per child of the root element.

    xml_data: the XML document as a string.
    attributesAreFields: if true, each row's fields are the attributes of
        the child element; otherwise they are the tag/text pairs of the
        child's own sub-elements.
    convertNumeric: if true, columns holding numbers are converted from
        strings to numeric types (see _coerce_numeric_columns).

    Returns the DataFrame.
    '''
    root = ET.XML(xml_data)   # element tree
    if attributesAreFields:
        all_records = [dict(child.attrib) for child in root]
    else:
        all_records = [{sub.tag: sub.text for sub in child} for child in root]
    df = pd.DataFrame(all_records)
    if convertNumeric:
        # DataFrame.convert_objects is gone from current pandas.
        df = _coerce_numeric_columns(df)
    return df


def _read_source(where):
    ''' Return the XML text found at a URL (starts with "http") or file path. '''
    if where.startswith('http'):
        with urllib.request.urlopen(where) as response:
            return response.read().decode('utf-8')
    # Text mode already yields str, so no .decode('utf-8') here.
    with open(where, 'r') as handle:
        return handle.read()


def _save_scatter_with_fit(plt, frame, xcol, fit_x, fit_y, xlabel, title, filename):
    ''' Scatter-plot frame[xcol] against frame.drought, overlay the fitted
    line (fit_x, fit_y) in green, and save the figure to filename. '''
    frame.plot(kind='scatter', x=xcol, y='drought', color='blue')
    plt.plot(fit_x, fit_y, color='green', linewidth=1)
    # Axis limits and ticks need scalar maxima, so take them from the
    # columns of the frame rather than from the one-column DataFrames.
    xmax = frame[xcol].max()
    ymax = frame.drought.max()
    plt.axis([0, 1 + xmax, -1, 1 + ymax])
    plt.xticks(np.arange(1 + xmax))
    plt.yticks(np.arange(1 + ymax))
    plt.xlabel(xlabel)
    plt.ylabel('Number of seasons since last playoff win')
    plt.title(title)
    plt.savefig(filename)


def main():
    ''' Load the NFL table, print ASCII plots, fit the three regressions,
    save "XYplot.png" and "ZYplot.png", and print the fitted equations. '''
    # Imported here, not at the top of the file, so that the helpers above
    # stay usable on systems without these packages.
    import matplotlib
    matplotlib.use('Agg')   # Used for linux. To make sure the backend is a non-gui backend
    from matplotlib import pyplot as plt
    from sklearn import linear_model

    NFLdf = xml2df(_read_source(location), True)

    # Illustrating that arrays can do +- or */ with scalars
    Yarr = 2016 - np.array(NFLdf['ylpw'])   # changed from 'lastPlayoffWin'
    Zarr = np.array(NFLdf['pop'])/1000000.0

    # Extend a DataFrame with new columns---the "NFLdf = " part is needed
    NFLdf = NFLdf.assign(drought=Yarr, mpop=Zarr)

    # Columns of the new data object still need to be cast as DataFrames
    X = pd.DataFrame(NFLdf['cs90'])    # X = pd.DataFrame(NFLdf.cs90) works too
    Y = pd.DataFrame(NFLdf.drought)    # or = pd.DataFrame(NFLdf['drought']) but not pd.DataFrame(Yarr)
    Z = pd.DataFrame(NFLdf.mpop)       # or could use Zarr

    # When you make a table with 2 columns the casts start to make sense
    W = pd.DataFrame(NFLdf, columns=['mpop','cs90'])

    print(crudeplot(NFLdf['cs90'],Yarr,61,27,"Coaches since 1990","Years since last playoff win"))
    print(crudeplot(Zarr,Yarr,61,27,"Population in millions","Years since last playoff win"))

    # Each fit gets its own estimator: fit() returns the estimator itself,
    # so refitting one shared object would silently change earlier "models".
    model1 = linear_model.LinearRegression().fit(X,Y)
    intercept1 = model1.intercept_[0]
    slope1 = model1.coef_[0][0]
    score1 = model1.score(X,Y)
    pred1 = model1.predict(X)
    _save_scatter_with_fit(plt, NFLdf, 'cs90', X, pred1,
                           'Number of coaches since 1990',
                           'Playoff Drought Versus Coaching Changes',
                           'XYplot.png')

    model2 = linear_model.LinearRegression().fit(Z,Y)   # Though called "Z", the source variable still comes first
    intercept2 = model2.intercept_[0]
    slope2 = model2.coef_[0][0]
    score2 = model2.score(Z,Y)
    pred2 = model2.predict(Z)
    _save_scatter_with_fit(plt, NFLdf, 'mpop', Z, pred2,
                           'Media market population in millions',
                           'Playoff Drought Versus Media Market Size',
                           'ZYplot.png')

    model3 = linear_model.LinearRegression().fit(W,Y)
    intercept3 = model3.intercept_[0]
    slope31 = model3.coef_[0][0]   # coefficient of 'mpop', the first column of W
    slope32 = model3.coef_[0][1]   # coefficient of 'cs90', the second column of W
    score3 = model3.score(W,Y)

    print("\nIgnore any warning(s) above")
    print()
    print()
    print("Results for drought versus coaches:")
    print("Drought in years = ", format(intercept1,'.3f'), " + ", format(slope1,'.3f'),
          "*(number of coaches since 1990) with score R^2 = ", format(score1,'.4f'),sep='')
    print()
    print("Results for drought versus population:")
    print("Drought in years = ", format(intercept2,'.3f'), " + ", format(slope2,'.3f'),
          "*(media market pop in millions) with score R^2 = ", format(score2,'.4f'),sep='')
    print()
    print("Results for drought versus both:")
    print("Drought in years = ", format(intercept3,'.3f'), " + ", format(slope31,'.3f'),
          "*(media market pop in millions) + ", format(slope32,'.3f'),
          "*(cs90) with score R^2 = ", format(score3,'.4f'),sep='')
    print()


if __name__ == "__main__":
    main()