Linear Regression
In the example below, a Linear Regression classifier is used on a number of data sets of varying dimensionality. A second implementation is also shown, using Gradient Descent to perform the regression. The code and output are shown below, and are also available as an IPython notebook and on Bitbucket. The data files used in the notebook are included here.
import random
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from sklearn import datasets, linear_model
import itertools
Data Extraction & Evaluation Functions¶
For some of the data sets, multiple combinations of features are tried and compared to find the ones where the linear regression performs best. This can significantly extend the execution time, but the resulting output better demonstrates the effectiveness of the classifier.
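As a rough illustration of that cost (a sketch, not part of the original notebook, assuming a hypothetical 13-column file searched three columns at a time), the number of candidate column orderings grows quickly:

# count the candidate (predictor..., target) column orderings for a
# hypothetical 13-column file when 3 columns are examined at a time
num_cols, num_used = 13, 3
candidates = [p for combo in itertools.combinations(range(num_cols), num_used)
              for p in itertools.permutations(combo)]
print(len(candidates))  # 1716 orderings, each requiring a separate fit and evaluation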
# an object representing a data set
class DataSet():
def __init__(self, data, target, headers):
ndxs = random.sample(list(range(len(data))), len(data))
self.data = np.array([data[ndx] for ndx in ndxs])
self.target = np.array([target[ndx] for ndx in ndxs])
self.headers = [h.strip() for h in headers]
def __repr__(self):
d = {'data': self.data, 'target': self.target}
return str(d)
# creates a dataset object from a file
def get_data(fname, columns):
data = []
target = []
with open(fname, 'r') as f:
headers = f.readline().split(',')
        # readline() above already consumed the header row
        for line in f.readlines():
cols = line.split(',')
data.append([float(cols[n]) for n in list(range(len(cols))) if n in columns[:-1]])
target.append(float(cols[columns[-1]]))
return DataSet(data, target, headers)
# mean squared prediction error (printed as "Sum squared error" in the results below)
def SSE(regr, x, y):
return np.mean((regr.predict(x) - y) ** 2)
def R2(pred, y):
TOP = len(y) * sum([pred[n]*y[n] for n in range(len(y))]) - sum(pred)*sum(y)
BOT = (len(y) * sum([z**2 for z in pred]) - sum(pred)**2) * (len(y) * sum([z**2 for z in y])- sum(y)**2)
r2 = TOP ** 2 / BOT
if r2 == float('Inf'):
r2 = -1
return r2
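R2 above is the squared Pearson correlation between the predicted and true values; a quick sanity check (an illustrative sketch on made-up numbers, not part of the original notebook) is that it matches the square of numpy's correlation coefficient:

# sanity check: R2 should equal the squared Pearson correlation coefficient
pred = [1.0, 2.1, 2.9, 4.2]
y = [1.0, 2.0, 3.0, 4.0]
print(R2(pred, y))                      # ~0.992
print(np.corrcoef(pred, y)[0, 1] ** 2)  # same value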
Linear Regression Implementation
class LRClassifier():
def __init__(self):
self.coef_ = None
self.intercept_ = None
# fits the classifier to the data
def fit(self, x, y):
try:
self.coef_ = np.dot(np.linalg.inv(np.dot(np.transpose(x), x)), np.dot(np.transpose(x), y))
self.intercept_ = (sum(y) - sum([np.dot(xval, self.coef_) for xval in x])) / len(x)
            # keep whichever sign of the coefficients gives the lower error
            sse_cur = SSE(self, x, y)
self.set_vals(-self.coef_, x, y)
sse_flipped = SSE(self, x, y)
if sse_flipped > sse_cur:
self.set_vals(-self.coef_, x, y)
        except Exception as ex:
            # fall back to a zero model with one coefficient per feature
            self.coef_ = [0 for v in x[0]]
            self.intercept_ = 0
def set_vals(self, coef, x, y):
self.coef_ = coef
self.intercept_ = (sum(y) - np.dot(self.coef_, sum(x))) / len(x)
# uses fitted values to predict new ones
def predict(self, x):
return [(sum([row[cN] * self.coef_[cN] for cN in range(len(row))]) + self.intercept_) for row in x]
# finds all permutations to try
def perms(numUsedVars, numCols, ignore=[]):
vals = []
numbers = [x for x in list(range(numCols)) if x not in ignore]
for combo in itertools.combinations(numbers, numUsedVars):
for item in itertools.permutations(combo):
vals.append(tuple(item))
return vals
# find the best attributes in a dataset
# fname: file storing the data, must have first row of column names and only numeric values
# numVars: how many attributes to look at (4 = 3 predictors and 1 target attribute)
# ignore: column indices to ignore in the data set (non-numeric values, etc.)
def find_best(fname, numVars, ignore=[]):
with open(fname, 'r') as f:
line = f.readline()
num_cols = len(line.split(','))
# try all pairs
pairs = {}
bestPair = None
my_regr = LRClassifier()
for columns in perms(numVars, num_cols, ignore):
# Load a dataset
dataset = get_data(fname, columns)
# Split the data into training/testing sets
train_X = dataset.data[:-40]
test_X = dataset.data[-40:]
train_Y = dataset.target[:-40]
test_Y = dataset.target[-40:]
# use model
my_regr.fit(train_X, train_Y)
pairs[columns] = R2(my_regr.predict(test_X), test_Y)
best = max(pairs, key=pairs.get)
print('Best (' + str(numVars-1) + '): ', ','.join([dataset.headers[x] for x in best[:-1]]), '->',
dataset.headers[best[-1]], '| R^2 =', pairs[best])
return best, pairs[best]
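As a quick check of the closed-form fit in LRClassifier (a sketch on synthetic data, not part of the original notebook), its output can be compared against scikit-learn's LinearRegression, which is already imported above as linear_model. Because LRClassifier solves the normal equations without an intercept column and adds the intercept afterwards, the two fits will generally be close but not identical:

# compare LRClassifier against scikit-learn on synthetic data (illustrative sketch)
np.random.seed(0)
X = np.random.rand(100, 2) * 10
y = 3 * X[:, 0] - 2 * X[:, 1] + 5 + np.random.randn(100)
mine = LRClassifier()
mine.fit(X, y)
skl = linear_model.LinearRegression()
skl.fit(X, y)
print('mine:   ', mine.coef_, mine.intercept_)
print('sklearn:', skl.coef_, skl.intercept_)
print('SSE mine / sklearn: %.3f / %.3f' % (SSE(mine, X, y), SSE(skl, X, y)))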
Univariate Data: Happiness Index
2016 Happiness Index: Country, GDP, Life Expectancy, etc.
fname = '2016-happy.csv'
# Load a univariate dataset
best, r2 = find_best(fname, 2, [0, 1, 2, 4, 5, 12])
# fix the columns used for the plot below (the GDP per capita / life expectancy pair)
best = [8, 6]
dataset = get_data(fname, best)
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients: ', my_regr.coef_)
print("\tSum squared error: %.2f" % SSE(my_regr, dataset.data, dataset.target))
print('\tIntercept: %.2f' % my_regr.intercept_)
# Plot outputs
plt.scatter(dataset.data, dataset.target, color='black')
plt.plot(dataset.data, my_regr.predict(dataset.data), color='red', linewidth=3)
plt.xlabel(dataset.headers[best[0]])
plt.ylabel(dataset.headers[best[1]])
plt.xticks(())
plt.yticks(())
plt.show()
print('Best columns:', best)
Bivariate Data: Cars
Cars data set: MPG, weight, acceleration, etc.
fname = 'cars.csv'
# Load a dataset
best, r2 = find_best(fname, 3, [1, 7])
dataset = get_data(fname, best)
# Split the data into training/testing sets
train_X = dataset.data[:-80]
test_X = dataset.data[-80:]
train_Y = dataset.target[:-80]
test_Y = dataset.target[-80:]
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(train_X, train_Y)
print('\tCoefficients: ', my_regr.coef_)
print("\tSum squared error: %.2f" % SSE(my_regr, test_X, test_Y))
print('\tIntercept: %.2f' % my_regr.intercept_)
# scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter([x[0] for x in dataset.data], [x[1] for x in dataset.data], dataset.target, color='b')
ax.set_xlabel(dataset.headers[best[0]])
ax.set_ylabel(dataset.headers[best[1]])
ax.set_zlabel(dataset.headers[best[2]])
# unfinished groundwork for plotting the regression plane (see the TODO below)
feat1, feat2 = np.meshgrid([x[0] for x in dataset.data], [x[1] for x in dataset.data])
y = []
for x in dataset.data:
y.append(x * my_regr.coef_)
plt.show()
TODO: plot the plane representing the regression
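One possible way to finish that TODO (a sketch, assuming the fitted my_regr, dataset, and best from the cell above are still in scope) is to evaluate the fitted plane on a grid and draw it with plot_surface:

# sketch: plot the fitted regression plane over the scatter points
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
x1 = np.array([row[0] for row in dataset.data])
x2 = np.array([row[1] for row in dataset.data])
ax.scatter(x1, x2, dataset.target, color='b')
# evaluate z = c0*x1 + c1*x2 + intercept on a regular grid
g1, g2 = np.meshgrid(np.linspace(x1.min(), x1.max(), 20),
                     np.linspace(x2.min(), x2.max(), 20))
plane = my_regr.coef_[0] * g1 + my_regr.coef_[1] * g2 + my_regr.intercept_
ax.plot_surface(g1, g2, plane, color='red', alpha=0.3)
ax.set_xlabel(dataset.headers[best[0]])
ax.set_ylabel(dataset.headers[best[1]])
ax.set_zlabel(dataset.headers[best[2]])
plt.show()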
Multivariate Data: Wine
Wine data set: Alcohol content, quality, pH, etc.
# Load a dataset
best = None
bestR2 = None
fname = 'wine.csv'
print('----------Trying-----------')
for numVars in range(4, 7):
    bst, r2 = find_best(fname, numVars, [1, 2, 4, 5, 7, 9])
    if best is None or r2 > bestR2:
best = bst[:]
bestR2 = r2
print('---------------------------')
dataset = get_data(fname, best)
print('FINAL: ', [dataset.headers[x] for x in best[:-1]], '\n\t\t->', dataset.headers[best[-1]], '| R^2 =', bestR2)
# Split the data into training/testing sets
train_X = dataset.data[:-400]
test_X = dataset.data[-400:]
train_Y = dataset.target[:-400]
test_Y = dataset.target[-400:]
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(train_X, train_Y)
print('\tCoefficients: ', my_regr.coef_)
print("\tSum squared error: %.2f" % SSE(my_regr, test_X, test_Y))
print('\tIntercept: %.2f' % my_regr.intercept_)
Best attributes for linear regression:
fixed acidity, residual sugar, total sulfur dioxide, quality, pH -> alcohol
Categorical Data: Cars (again, but different!)
Another car data set: fuel type, number of doors, engine location, price, MPG, etc. Some of this data is categorical, and must be altered to work with linear regression.
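The idea is simple dummy coding. As a small illustration (a sketch with made-up values, not rows from the actual file), one categorical column becomes one binary column per value, and the conversion routine below keeps all but one of them, since the remaining columns imply the dropped one:

# illustrative sketch: dummy-coding a single categorical column
styles = ['hatchback', 'sedan', 'sedan', 'wagon', 'hatchback']
values = sorted(set(styles))          # ['hatchback', 'sedan', 'wagon']
for s in styles:
    # one binary column per value, skipping the first
    print(s, '->', [1 if s == v else 0 for v in values[1:]])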
# creates a categorical data set
# ':' must not be present in attribute names
def convertCategorical(fname, ignore=[]):
outname = fname + '.out'
with open(fname, 'r') as f:
# headers
headers = [x.strip() for x in f.readline().split(',')]
# find columns
data = [r.strip().split(',') for r in f.readlines()[1:] if '?' not in r]
columns = set()
cVals = {}
for cNdx in range(len(headers)):
numeric = True
values = set()
for row in data:
values.add(row[cNdx])
try:
x = float(row[cNdx])
except:
numeric = False
if not numeric:
columns.add(cNdx)
cVals[cNdx] = [headers[cNdx] + ':' + val for val in values]
# build new dataset
with open(outname, 'w+') as out:
new_headers = []
# get headers
for n in [ndx for ndx in range(len(row)) if ndx not in ignore]:
if n not in columns:
new_headers.append(headers[n])
else:
for v in cVals[n][1:]:
new_headers.append(v)
# get data
lines = []
for row in data:
line = []
for n in [ndx for ndx in range(len(row)) if ndx not in ignore]:
if n not in columns:
line.append(row[n])
else:
for v in cVals[n][1:]:
line.append('1' if row[n] == v.split(':')[1] else '0')
lines.append(line)
# write file
out.write(','.join(new_headers) + '\n')
txt = ''
for line in lines:
txt += ','.join(line) + '\n'
out.write(txt.strip())
return outname
# make categorical data
in_name = 'cars2.csv'
fname = convertCategorical(in_name, [0, 1, 2, 4, 17, 18, 19, 20, 22, 23])
# Load a dataset
cols = [1, 4, 12, 21, 22, 23, 24]
dataset = get_data(fname, cols)
print('Columns: ', ', '.join([dataset.headers[x] for x in cols[:-1]]), '->', dataset.headers[cols[-1]])
# Split the data into training/testing sets
train_X = dataset.data[:-40]
test_X = dataset.data[-40:]
train_Y = dataset.target[:-40]
test_Y = dataset.target[-40:]
print('\nAnalysis:')
my_regr = LRClassifier()
my_regr.fit(train_X, train_Y)
print('\tCoefficients: ', '\n\t\t', '\n\t\t'.join([str(coef) for coef in my_regr.coef_]))
print("\tSum squared error: %.2f" % SSE(my_regr, test_X, test_Y))
print('\tIntercept: %.2f' % my_regr.intercept_)
my_regr.fit(dataset.data, dataset.target)
print('\tR^2: %.2f' % R2(my_regr.predict(dataset.data), dataset.target))
Gradient Descent
The following cell contains the implementation of gradient descent for linear regression. In the results section, any time results are shown in the form "first number / second number", the first number comes from the closed-form LRClassifier fit above and the second from this gradient descent implementation.
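At its core, each pass of the fit loop below performs one batch update of the coefficients: compute the residuals of the current fit, average the error gradient over the rows, and step against it scaled by the learning rate h. A self-contained sketch of that single step (toy data, with a leading column of ones standing in for the intercept):

# one batch gradient descent step for linear regression (illustrative sketch)
X = np.array([[1.0, 1.0], [1.0, 2.0], [1.0, 3.0]])  # leading 1s -> intercept term
y = np.array([2.0, 4.0, 6.0])
coef = np.zeros(2)
h = 0.1                                              # learning rate
error = np.dot(X, coef) - y                          # residuals of the current fit
gradient = np.dot(X.T, error) / len(X)               # error gradient averaged over the data
coef = coef - h * gradient                           # step against the gradient
print(coef)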
class GradientDescent:
def __init__(self, h, min_h, h_delta, ep, iterations=1000):
self.coef_ = None
self.hStart_ = h
self.hEnd_ = min_h
self.hDelta_ = h_delta
self.ep_ = ep
self.iterations_ = iterations
def __addIntercept(self, data):
return np.array([[1, *d[:]] for d in data])
def predict(self, x):
x = self.__addIntercept(x)
predicted = []
for row in x:
v = 0
for i in range(len(row)):
v += row[i] * self.coef_[i]
predicted.append(v)
predicted = np.array(predicted)
return predicted
def __absSum(self, e):
return sum([abs(v) for v in e])
def fit(self, data, tgt):
data = self.__addIntercept(data)
errors = []
# reset vars
        # start from all-zero coefficients, one per column (including the leading intercept column)
        self.coef_ = np.zeros(len(data[0]))
prev_error = None
error = None
delta = self.ep_ * 2
iterations = 0
h = self.hStart_
dataSize = float(len(data))
c = 0
lowers = 0
for c in range(self.iterations_):
guess = np.dot(data, self.coef_)
error = guess - tgt
gradient = np.dot(data.transpose(), error) / len(data)
self.coef_ = np.array(self.coef_ - np.array((h * gradient)))
errors.append(error)
# evaluate new error value compared to previous
totError = self.__absSum(error)
if prev_error != None and prev_error - totError < self.ep_:
if h > self.hEnd_:
lowers += 1
h -= self.hDelta_
c = 0
else:
print('Learning rate at minimum, converged after', len(errors), 'iterations.')
print('Lowered learning rate', lowers, 'times.')
                    break
            # remember this iteration's error so the next one can check for convergence
            prev_error = totError
        self.intercept_ = self.coef_[0]
        if c == self.iterations_ - 1:
print('Hit max iterations')
self.errors = [(i, errors[i]) for i in range(len(errors))]
Gradient Descent: Univariate
Identifying linear relationships between features describing countries in the 2016 "world-happiness" data.
# Load the univariate dataset
fname = '2016-happy.csv'
best = [8, 6]
dataset = get_data(fname, best)
print('\nAnalysis:')
gd = GradientDescent(h=.5, min_h=.001, h_delta=.001, ep=.001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients:\t\t', my_regr.coef_, ' / ', gd.coef_[1:])
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', 0)#gd.intercept_)
# Plot outputs
plt.scatter(dataset.data, dataset.target, color='black')
plt.plot(dataset.data, my_regr.predict(dataset.data), color='red', linewidth=3)
plt.plot(dataset.data, gd.predict(dataset.data), color='blue', linewidth=3)
plt.xlabel(dataset.headers[best[0]])
plt.ylabel(dataset.headers[best[1]])
plt.xticks(())
plt.yticks(())
plt.title("Relationship between GDP per Capita and Life Expectancy")
print()
plt.show()
The red line in the figure above shows the closed-form (LRClassifier) fit, and the blue line shows the gradient descent fit.
Gradient Descent: Bivariate
Showing the relationship between weight, horsepower, and engine displacement in a car.
np.seterr(invalid='raise')
fname = 'cars.csv'
# Load a dataset
best = [4, 3, 2]
dataset = get_data(fname, best)
print('\nAnalysis:')
gd = GradientDescent(h=.0000001, min_h=.000000001, h_delta=.00000001, ep=.00000001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients:\t\t', my_regr.coef_, ' / ', gd.coef_[1:])
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', gd.intercept_)
# scatter plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter([x[0] for x in dataset.data], [x[1] for x in dataset.data], dataset.target, color='b')
ax.set_xlabel(dataset.headers[best[0]])
ax.set_ylabel(dataset.headers[best[1]])
ax.set_zlabel(dataset.headers[best[2]])
# unused leftover from the plane-plotting groundwork in the bivariate section above
feat1, feat2 = np.meshgrid([x[0] for x in dataset.data], [x[1] for x in dataset.data])
y = []
for x in dataset.data:
y.append(x * my_regr.coef_)
plt.title("Relationship between weight, HP, and displacement")
plt.show()
Gradient Descent: Multivariate
Using four variables to predict the alcoholic content of a wine.
# Load a dataset
fname = 'wine.csv'
best = [0, 11, 3, 8, 10]
dataset = get_data(fname, best)
print('Attributes: ', [dataset.headers[x] for x in best[:-1]], '\n\t\t->', dataset.headers[best[-1]])
print('\nAnalysis:')
gd = GradientDescent(h=.0000001, min_h=.000000001, h_delta=.00000001, ep=.00000001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoefficients:\t\t', my_regr.coef_, '\n\t\t\t\t', gd.coef_)
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', gd.intercept_)
Gradient Descent: Categorical
This example also looks at car data, with some non-numerical attributes (body styles can be hatchbacks, etc.). To allow this kind of data to be used in linear regression, the attributes are split into new binary ones representing each possible value.
# make categorical data
in_name = 'cars2.csv'
fname = convertCategorical(in_name, [0, 1, 2, 4, 17, 18, 19, 20, 22, 23])
# Load a dataset
cols = [1, 4, 12, 21, 22, 23, 24]
dataset = get_data(fname, cols)
print('Columns: ', ', '.join([dataset.headers[x] for x in cols[:-1]]), '->', dataset.headers[cols[-1]])
print('\nAnalysis:')
gd = GradientDescent(h=.0000001, min_h=.000000001, h_delta=.00000001, ep=.00000001)
gd.fit(dataset.data, dataset.target)
my_regr = LRClassifier()
my_regr.fit(dataset.data, dataset.target)
print('\tCoef:\t\t', my_regr.coef_, '\n\t\t\t', gd.coef_)
print("\tSum squared error:\t %.2f \t\t/ %.2f" % (SSE(my_regr, dataset.data, dataset.target),
SSE(gd, dataset.data, dataset.target)))
print('\tIntercept:\t\t %.2f' % my_regr.intercept_, '\t\t/ ', gd.intercept_)