import urllib
import urllib.request
from math import sqrt

import matplotlib.pyplot as plot
import numpy as np
from sklearn import datasets, linear_model
# Read the semicolon-delimited wine-quality CSV into feature rows and labels.
target_url = "http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
xList = []   # one list of float feature values per data row
labels = []  # last field of each data row, kept as text
names = []   # column names taken from the header line
firstLine = True
# The context manager closes the HTTP response once every line is consumed.
with urllib.request.urlopen(target_url) as data:
    for line in data:
        # The response yields bytes; decode them instead of parsing the
        # repr produced by str(bytes), which leaves "b'" and escaped-newline
        # residue inside the fields.
        text = line.decode("utf-8").strip()
        if not text:
            # Skip blank lines so float() never sees an empty field.
            continue
        fields = text.split(";")
        if firstLine:
            # Header fields are wrapped in double quotes in the file.
            names = [field.strip('"') for field in fields]
            firstLine = False
        else:
            labels.append(fields[-1])
            xList.append([float(num) for num in fields[:-1]])
# Standardize every feature column and the labels: subtract the mean and
# divide by the population standard deviation (the divisor is nrows).
nrows = len(xList)
ncols = len(xList[0])
featureMatrix = np.array(xList)
xMeans = featureMatrix.mean(axis=0)
xSD = featureMatrix.std(axis=0)
xNormalized = [
    [(value - mean) / sd for value, mean, sd in zip(sample, xMeans, xSD)]
    for sample in xList
]
# Labels may still be text at this point; convert before doing arithmetic.
labels = [float(label) for label in labels]
meanLabel = sum(labels) / nrows
squaredDeviations = [
    (label - meanLabel) * (label - meanLabel) for label in labels
]
sdLabel = sqrt(sum(squaredDeviations) / nrows)
labelNormalize = [(label - meanLabel) / sdLabel for label in labels]
# Iteratively build the coefficient vector: at every step, find the attribute
# whose values correlate most strongly (in absolute value) with the current
# residuals and move its coefficient by a fixed amount in the direction of
# that correlation. betaMat records beta before the first step and after
# every step.
beta = [0.0] * ncols
betaMat = [beta[:]]
nSteps = 350
StepSize = 0.004
for step in range(nSteps):
    # Residual of each row under the current coefficients.
    residuals = [
        target - sum([x * b for x, b in zip(sample, beta)])
        for sample, target in zip(xNormalized, labelNormalize)
    ]
    # Average product of each attribute column with the residuals.
    corr = [
        sum([sample[j] * resid
             for sample, resid in zip(xNormalized, residuals)]) / nrows
        for j in range(ncols)
    ]
    # Pick the first attribute with the largest absolute correlation.
    iStar, corrStar = 0, corr[0]
    for j, candidate in enumerate(corr[1:], start=1):
        if abs(candidate) > abs(corrStar):
            iStar, corrStar = j, candidate
    # corrStar / abs(corrStar) contributes only the sign of the correlation.
    beta[iStar] += StepSize * corrStar / abs(corrStar)
    betaMat.append(beta[:])
# Plot one curve per attribute showing how its coefficient evolves.
# betaMat holds the initial coefficients plus one snapshot per step, so the
# x axis spans len(betaMat) points; using nSteps here would silently drop
# the final coefficient values from the plot.
xaxis = range(len(betaMat))
for i in range(ncols):
    coefCurve = [snapshot[i] for snapshot in betaMat]
    plot.plot(xaxis, coefCurve)
# Display the figure once, after every curve has been added.
plot.show()