Linear Regression
In the example below, a Linear Regression classifier is used on a number of data sets of varying dimensionality. A second implementation is also shown, using Gradient Descent to perform the regression. The code and output are shown below, and are also available as an IPython notebook and on Bitbucket. The data files used in the notebook are included here.
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets, linear_model
import itertools
Data Extraction & Evaluation Functions¶
For some of the data sets, multiple combinations of features are tried and compared to find the ones where the linear regression performs best. This can significantly extend the execution time, but the resulting output better demonstrates the effectiveness of the classifier.
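As a rough illustration of that cost (a sketch, not part of the original notebook, assuming a hypothetical 13-column file searched three columns at a time), the number of candidate column orderings grows quickly:

# count the candidate (predictor..., target) column orderings for a
# hypothetical 13-column file when 3 columns are examined at a time
num_cols, num_used = 13, 3
candidates = [p for combo in itertools.combinations(range(num_cols), num_used)
              for p in itertools.permutations(combo)]
print(len(candidates))  # 1716 orderings, each requiring a separate fit and evaluation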
# an object representing a data set
class DataSet():
def __init__(self, data, target, headers):
ndxs = random.sample(list(range(len(data))), len(data))
self.data = np.array([data[ndx] for ndx in ndxs])
self.target = np.array([target[ndx] for ndx in ndxs])
self.headers = [h.strip() for h in headers]
def __repr__(self):
d = {'data': self.data, 'target': self.target}
return str(d)
# creates a dataset object from a file
def get_data(fname, columns):
data = []
target = []
with open(fname, 'r') as f:
headers = f.readline().split(',')
        # readline() above already consumed the header row
        for line in f.readlines():
cols = line.split(',')
data.append([float(cols[n]) for n in list(range(len(cols))) if n in columns[:-1]])
target.append(float(cols[columns[-1]]))
return DataSet(data, target, headers)
# mean squared prediction error (printed as "Sum squared error" in the results below)
def SSE(regr, x, y):
return np.mean((regr.predict(x) - y) ** 2)
def R2(pred, y):
TOP = len(y) * sum([pred[n]*y[n] for n in range(len(y))]) - sum(pred)*sum(y)
BOT = (len(y) * sum([z**2 for z in pred]) - sum(pred)**2) * (len(y) * sum([z**2 for z in y])- sum(y)**2)
r2 = TOP ** 2 / BOT
if r2 == float('Inf'):
r2 = -1
return r2
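R2 above is the squared Pearson correlation between the predicted and true values; a quick sanity check (an illustrative sketch on made-up numbers, not part of the original notebook) is that it matches the square of numpy's correlation coefficient:

# sanity check: R2 should equal the squared Pearson correlation coefficient
pred = [1.0, 2.1, 2.9, 4.2]
y = [1.0, 2.0, 3.0, 4.0]
print(R2(pred, y))                      # ~0.992
print(np.corrcoef(pred, y)[0, 1] ** 2)  # same value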
Linear Regression Implementation
class LRClassifier():
def __init__(self):
self.coef_ = None
self.intercept_ = None
# fits the classifier to the data
def fit(self, x, y):
try:
self.coef_ = np.dot(np.linalg.inv(np.dot(np.transpose(x), x)), np.dot(np.transpose(x), y))
self.intercept_ = (sum(y) - sum([np.dot(xval, self.coef_) for xval in x])) / len(x)
            # keep whichever sign of the coefficients gives the lower error
            sse_cur = SSE(self, x, y)
self.set_vals(-self.coef_, x, y)
sse_flipped = SSE(self, x, y)
if sse_flipped > sse_cur:
self.set_vals(-self.coef_, x, y)
        except Exception as ex:
            # fall back to a zero model with one coefficient per feature
            self.coef_ = [0 for v in x[0]]
            self.intercept_ = 0
def set_vals(self, coef, x, y):
self.coef_ = coef
self.intercept_ = (sum(y) - np.dot(self.coef_, sum(x))) / len(x)
# uses fitted values to predict new ones
def predict(self, x):
return [(sum([row[cN] * self.coef_[cN] for cN in range(len(row))]) + self.intercept_) for row in x]
# finds all permutations to try
def perms(numUsedVars, numCols, ignore=[]):
vals = []
numbers = [x for x in list(range(numCols)) if x not in ignore]
for combo in itertools.combinations(numbers, numUsedVars):
for item in itertools.permutations(combo):
vals.append(tuple(item))
return vals
# find the best attributes in a dataset
# fname: file storing the data, must have first row of column names and only numeric values
# numVars: how many attributes to look at (4 = 3 predictors and 1 target attribute)
# ignore: column indices to ignore in the data set (non-numeric values, etc.)
def find_best(fname, numVars, ignore=[]):
with open(fname, 'r') as f:
line = f.readline()
num_cols = len(line.split(','))
# try all pairs
pairs = {}
bestPair = None
my_regr = LRClassifier()
for columns in perms(numVars, num_cols, ignore):
# Load a dataset
dataset = get_data(fname, columns)
# Split the data into training/testing sets
train_X = dataset.data[:-40]
test_X = dataset.data[-40:]
train_Y = dataset.target[:-40]
test_Y = dataset.target[-40:]
# use model
my_regr.fit(train_X, train_Y)
pairs[columns] = R2(my_regr.predict(test_X), test_Y)
best = max(pairs, key=pairs.get)
print('Best (' + str(numVars-1) + '): ', ','.join([dataset.headers[x] for x in best[:-1]]), '->',
dataset.headers[best[-1]], '| R^2 =', pairs[best])
return best, pairs[best]
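As a quick check of the closed-form fit in LRClassifier (a sketch on synthetic data, not part of the original notebook), its output can be compared against scikit-learn's LinearRegression, which is already imported above as linear_model. Because LRClassifier solves the normal equations without an intercept column and adds the intercept afterwards, the two fits will generally be close but not identical:

# compare LRClassifier against scikit-learn on synthetic data (illustrative sketch)
np.random.seed(0)
X = np.random.rand(100, 2) * 10
y = 3 * X[:, 0] - 2 * X[:, 1] + 5 + np.random.randn(100)
mine = LRClassifier()
mine.fit(X, y)
skl = linear_model.LinearRegression()
skl.fit(X, y)
print('mine:   ', mine.coef_, mine.intercept_)
print('sklearn:', skl.coef_, skl.intercept_)
print('SSE mine / sklearn: %.3f / %.3f' % (SSE(mine, X, y), SSE(skl, X, y)))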
Univariate Data: Happiness Index
2016 Happiness Index: Country, GDP, Life Expectancy, etc.
fname = '2016-happy.csv'
# Load a univariate dataset
best, r2 = find_best(fname, 2, [0, 1, 2, 4, 5, 12])
# fix the columns used for the plot below (the GDP per capita / life expectancy pair)
best = [8, 6]
dataset = get_data(fname, best)
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients: ', my_regr.coef_)
print("\tSum squared error: %.2f" % SSE(my_regr, dataset.data, dataset.target))
print('\tIntercept: %.2f' % my_regr.intercept_)
# Plot outputs
plt.scatter(dataset.data, dataset.target, color='black')
plt.plot(dataset.data, my_regr.predict(dataset.data), color='red', linewidth=3)
plt.xlabel(dataset.headers[best[0]])
plt.ylabel(dataset.headers[best[1]])
plt.xticks(())
plt.yticks(())
plt.show()
print('Best columns:', best)
Bivariate Data: Cars
Cars data set: MPG, weight, acceleration, etc.
fname = 'cars.csv'
# Load a dataset
best, r2 = find_best(fname, 3, [1, 7])
dataset = get_data(fname, best)
# Split the data into training/testing sets
train_X = dataset.data[:-80]
test_X = dataset.data[-80:]
train_Y = dataset.target[:-80]
test_Y = dataset.target[-80:]
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(train_X, train_Y)
print('\tCoefficients: ', my_regr.coef_)
print("\tSum squared error: %.2f" % SSE(my_regr, test_X, test_Y))
print('\tIntercept: %.2f' % my_regr.intercept_)
# scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter([x[0] for x in dataset.data], [x[1] for x in dataset.data], dataset.target, color='b')
ax.set_xlabel(dataset.headers[best[0]])
ax.set_ylabel(dataset.headers[best[1]])
ax.set_zlabel(dataset.headers[best[2]])
# unfinished groundwork for plotting the regression plane (see the TODO below)
feat1, feat2 = np.meshgrid([x[0] for x in dataset.data], [x[1] for x in dataset.data])
y = []
for x in dataset.data:
y.append(x * my_regr.coef_)
plt.show()
TODO: plot the plane representing the regression
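One possible way to finish that TODO (a sketch, assuming the fitted my_regr, dataset, and best from the cell above are still in scope) is to evaluate the fitted plane on a grid and draw it with plot_surface:

# sketch: plot the fitted regression plane over the scatter points
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x1 = np.array([row[0] for row in dataset.data])
x2 = np.array([row[1] for row in dataset.data])
ax.scatter(x1, x2, dataset.target, color='b')
# evaluate z = c0*x1 + c1*x2 + intercept on a regular grid
g1, g2 = np.meshgrid(np.linspace(x1.min(), x1.max(), 20),
                     np.linspace(x2.min(), x2.max(), 20))
plane = my_regr.coef_[0] * g1 + my_regr.coef_[1] * g2 + my_regr.intercept_
ax.plot_surface(g1, g2, plane, color='red', alpha=0.3)
ax.set_xlabel(dataset.headers[best[0]])
ax.set_ylabel(dataset.headers[best[1]])
ax.set_zlabel(dataset.headers[best[2]])
plt.show()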
Multivariate Data: Wine
Wine data set: Alcohol content, quality, pH, etc.
# Load a dataset
best = None
bestR2 = None
fname = 'wine.csv'
print('----------Trying-----------')
for numVars in range(4, 7):
    bst, r2 = find_best(fname, numVars, [1, 2, 4, 5, 7, 9])
    if best is None or r2 > bestR2:
best = bst[:]
bestR2 = r2
print('---------------------------')
dataset = get_data(fname, best)
print('FINAL: ', [dataset.headers[x] for x in best[:-1]], '\n\t\t->', dataset.headers[best[-1]], '| R^2 =', bestR2)
# Split the data into training/testing sets
train_X = dataset.data[:-400]
test_X = dataset.data[-400:]
train_Y = dataset.target[:-400]
test_Y = dataset.target[-400:]
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(train_X, train_Y)
print('\tCoefficients: ', my_regr.coef_)
print("\tSum squared error: %.2f" % SSE(my_regr, test_X, test_Y))
print('\tIntercept: %.2f' % my_regr.intercept_)
Best attributes for linear regression:
fixed acidity, residual sugar, total sulfur dioxide, quality, pH -> alcohol
Categorical Data: Cars (again, but different!)
Another car data set: fuel type, number of doors, engine location, price, MPG, etc. Some of this data is categorical, and must be altered to work with linear regression.
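The idea is simple dummy coding. As a small illustration (a sketch with made-up values, not rows from the actual file), one categorical column becomes one binary column per value, and the conversion routine below keeps all but one of them, since the remaining columns imply the dropped one:

# illustrative sketch: dummy-coding a single categorical column
styles = ['hatchback', 'sedan', 'sedan', 'wagon', 'hatchback']
values = sorted(set(styles))          # ['hatchback', 'sedan', 'wagon']
for s in styles:
    # one binary column per value, skipping the first
    print(s, '->', [1 if s == v else 0 for v in values[1:]])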
# creates a categorical data set
# ':' must not be present in attribute names
def convertCategorical(fname, ignore=[]):
outname = fname + '.out'
with open(fname, 'r') as f:
# headers
headers = [x.strip() for x in f.readline().split(',')]
# find columns
data = [r.strip().split(',') for r in f.readlines()[1:] if '?' not in r]
columns = set()
cVals = {}
for cNdx in range(len(headers)):
numeric = True
values = set()
for row in data:
values.add(row[cNdx])
try:
x = float(row[cNdx])
except:
numeric = False
if not numeric:
columns.add(cNdx)
cVals[cNdx] = [headers[cNdx] + ':' + val for val in values]
# build new dataset
with open(outname, 'w+') as out:
new_headers = []
# get headers
for n in [ndx for ndx in range(len(row)) if ndx not in ignore]:
if n not in columns:
new_headers.append(headers[n])
else:
for v in cVals[n][1:]:
new_headers.append(v)
# get data
lines = []
for row in data:
line = []
for n in [ndx for ndx in range(len(row)) if ndx not in ignore]:
if n not in columns:
line.append(row[n])
else:
for v in cVals[n][1:]:
line.append('1' if row[n] == v.split(':')[1] else '0')
lines.append(line)
# write file
out.write(','.join(new_headers) + '\n')
txt = ''
for line in lines:
txt += ','.join(line) + '\n'
out.write(txt.strip())
return outname
# make categorical data
in_name = 'cars2.csv'
fname = convertCategorical(in_name, [0, 1, 2, 4, 17, 18, 19, 20, 22, 23])
# Load a dataset
cols = [1, 4, 12, 21, 22, 23, 24]
dataset = get_data(fname, cols)
print('Columns: ', ', '.join([dataset.headers[x] for x in cols[:-1]]), '->', dataset.headers[cols[-1]])
# Split the data into training/testing sets
train_X = dataset.data[:-40]
test_X = dataset.data[-40:]
train_Y = dataset.target[:-40]
test_Y = dataset.target[-40:]
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(train_X, train_Y)
print('\tCoefficients: ', '\n\t\t', '\n\t\t'.join([str(coef) for coef in my_regr.coef_]))
print("\tSum squared error: %.2f" % SSE(my_regr, test_X, test_Y))
print('\tIntercept: %.2f' % my_regr.intercept_)
my_regr.fit(dataset.data, dataset.target)
print('\tR^2: %.2f' % R2(my_regr.predict(dataset.data), dataset.target))
Gradient Descent
The following cell contains the implementation of gradient descent for linear regression. In the results section, any time results are shown in the form "first number / second number", the first number comes from the closed-form LRClassifier fit above and the second from this gradient descent implementation.
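At its core, each pass of the fit loop below performs one batch update of the coefficients: compute the residuals of the current fit, average the error gradient over the rows, and step against it scaled by the learning rate h. A self-contained sketch of that single step (toy data, with a leading column of ones standing in for the intercept):

# one batch gradient descent step for linear regression (illustrative sketch)
X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])  # leading 1s -> intercept term
y = np.array([2.0, 4.0, 6.0])
coef = np.zeros(2)
h = 0.1                                              # learning rate
error = np.dot(X, coef) - y                          # residuals of the current fit
gradient = np.dot(X.T, error) / len(X)               # error gradient averaged over the data
coef = coef - h * gradient                           # step against the gradient
print(coef)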
class GradientDescent:
def __init__(self, h, min_h, h_delta, ep, iterations=1000):
self.coef_ = None
self.hStart_ = h
self.hEnd_ = min_h
self.hDelta_ = h_delta
self.ep_ = ep
self.iterations_ = iterations
def __addIntercept(self, data):
return np.array([[1, *d[:]] for d in data])
def predict(self, x):
x = self.__addIntercept(x)
predicted = []
for row in x:
v = 0
for i in range(len(row)):
v += row[i] * self.coef_[i]
predicted.append(v)
predicted = np.array(predicted)
return predicted
def __absSum(self, e):
return sum([abs(v) for v in e])
def fit(self, data, tgt):
data = self.__addIntercept(data)
errors = []
# reset vars
        # start from all-zero coefficients, one per column (including the leading intercept column)
        self.coef_ = np.zeros(len(data[0]))
prev_error = None
error = None
delta = self.ep_ * 2
iterations = 0
h = self.hStart_
dataSize = float(len(data))
c = 0
lowers = 0
for c in range(self.iterations_):
guess = np.dot(data, self.coef_)
error = guess - tgt
gradient = np.dot(data.transpose(), error) / len(data)
self.coef_ = np.array(self.coef_ - np.array((h * gradient)))
errors.append(error)
# evaluate new error value compared to previous
totError = self.__absSum(error)
if prev_error != None and prev_error - totError < self.ep_:
if h > self.hEnd_:
lowers += 1
h -= self.hDelta_
c = 0
else:
print('Learning rate at minimum, converged after', len(errors), 'iterations.')
print('Lowered learning rate', lowers, 'times.')
                    break
            # remember this iteration's error so the next one can check for convergence
            prev_error = totError
        self.intercept_ = self.coef_[0]
        if c == self.iterations_ - 1:
print('Hit max iterations')
self.errors = [(i, errors[i]) for i in range(len(errors))]
Gradient Descent: Univariate
Identifying linear relationships between features describing countries in the 2016 "world-happiness" data.
# Load the univariate dataset
fname = '2016-happy.csv'
best = [8, 6]
dataset = get_data(fname, best)
print('\nAnalysis:')
gd = GradientDescent(h=.5, min_h=.001, h_delta=.001, ep=.001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients:\t\t', my_regr.coef_, ' / ', gd.coef_[1:])
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', 0)#gd.intercept_)
# Plot outputs
plt.scatter(dataset.data, dataset.target, color='black')
plt.plot(dataset.data, my_regr.predict(dataset.data), color='red', linewidth=3)
plt.plot(dataset.data, gd.predict(dataset.data), color='blue', linewidth=3)
plt.xlabel(dataset.headers[best[0]])
plt.ylabel(dataset.headers[best[1]])
plt.xticks(())
plt.yticks(())
plt.title("Relationship between GDP per Capita and Life Expectancy")
print()
plt.show()
The red line in the figure above shows the closed-form (LRClassifier) fit, and the blue line shows the gradient descent fit.
Gradient Descent: Bivariate
Showing the relationship between weight, horsepower, and engine displacement in a car.
np.seterr(invalid='raise')
fname = 'cars.csv'
# Load a dataset
best = [4, 3, 2]
dataset = get_data(fname, best)
print('\nAnalysis:')
gd = GradientDescent(h=.0000001, min_h=.000000001, h_delta=.00000001, ep=.00000001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients:\t\t', my_regr.coef_, ' / ', gd.coef_[1:])
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', gd.intercept_)
# scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter([x[0] for x in dataset.data], [x[1] for x in dataset.data], dataset.target, color='b')
ax.set_xlabel(dataset.headers[best[0]])
ax.set_ylabel(dataset.headers[best[1]])
ax.set_zlabel(dataset.headers[best[2]])
# unused leftover from the plane-plotting groundwork in the bivariate section above
feat1, feat2 = np.meshgrid([x[0] for x in dataset.data], [x[1] for x in dataset.data])
y = []
for x in dataset.data:
y.append(x * my_regr.coef_)
plt.title("Relationship between weight, HP, and displacement")
plt.show()
Gradient Descent: Multivariate
Using four variables to predict the alcoholic content of a wine.
# Load a dataset
fname = 'wine.csv'
best = [0, 11, 3, 8, 10]
dataset = get_data(fname, best)
print('Attributes: ', [dataset.headers[x] for x in best[:-1]], '\n\t\t->', dataset.headers[best[-1]])
print('\nAnalysis:')
gd = GradientDescent(h=.0000001, min_h=.000000001, h_delta=.00000001, ep=.00000001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients:\t\t', my_regr.coef_, '\n\t\t\t\t', gd.coef_)
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', gd.intercept_)
Gradient Descent: Categorical
This example also looks at car data, with some non-numerical attributes (body styles can be hatchbacks, etc.). To allow this kind of data to be used in linear regression, the attributes are split into new binary ones representing each possible value.
# make categorical data
in_name = 'cars2.csv'
fname = convertCategorical(in_name, [0, 1, 2, 4, 17, 18, 19, 20, 22, 23])
# Load a dataset
cols = [1, 4, 12, 21, 22, 23, 24]
dataset = get_data(fname, cols)
print('Columns: ', ', '.join([dataset.headers[x] for x in cols[:-1]]), '->', dataset.headers[cols[-1]])
print('\nAnalysis:')
gd = GradientDescent(h=.0000001, min_h=.000000001, h_delta=.00000001, ep=.00000001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoef:\t\t', my_regr.coef_, '\n\t\t\t', gd.coef_)
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', gd.intercept_)