Prediction of College Tuition Rates Using Machine Learning Regression Models and Non-Financial Data

Andrew Smith, May 2nd, 2023

Introduction

This project relates the statistical profile of a university in the United States to its out-of-state tuition cost per student. Regression models are created to determine which features of a university correlate most strongly with tuition costs and, overall, whether the cost of tuition can be accurately predicted given only non-financial statistics.

The data is gathered from 777 unique universities across the United States and is shown below [1].

In [1]:
import numpy as np
from numpy.linalg import norm

import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import validation_curve

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.neighbors import KNeighborsRegressor
In [2]:
colleges_raw = pd.read_csv('College_Data.csv', index_col=0) # load dataset, index by school
colleges_raw.head(5)
Out[2]:
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
Abilene Christian University Yes 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60
Adelphi University Yes 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56
Adrian College Yes 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54
Agnes Scott College Yes 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59
Alaska Pacific University Yes 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15

For example, data is presented below for the University of Delaware (go blue hens!).

In [3]:
udel = colleges_raw.loc[['University of Delaware']]
udel
Out[3]:
Private Apps Accept Enroll Top10perc Top25perc F.Undergrad P.Undergrad Outstate Room.Board Books Personal PhD Terminal S.F.Ratio perc.alumni Expend Grad.Rate
University of Delaware Yes 14446 10516 3252 22 57 14130 4522 10220 4230 530 1300 82 87 18.3 15 10650 75

Each of the columns of the dataset is described below.

  • Private: Boolean flag for whether an institution is private (Yes) or public (No)
  • Apps: Number of applications received in the given time period
  • Accept: Number of students accepted
  • Enroll: Number of incoming students enrolled
  • Top10perc: Percentage of new students who were in the top 10% of their high school class
  • Top25perc: Percentage of new students who were in the top 25% of their high school class
  • F.Undergrad: Number of full-time undergraduate students
  • P.Undergrad: Number of part-time undergraduate students
  • Outstate: Total out-of-state tuition cost per semester (all in USD)
  • Room.Board: Cost of combined room & board per semester
  • Books: Estimated cost of books per student per semester
  • Personal: Estimated personal spending per student
  • PhD: Percentage of faculty with Ph.D. degrees
  • Terminal: Percentage of faculty with terminal degrees
  • S.F.Ratio: Ratio of the number of students to the number of faculty
  • perc.alumni: Percentage of alumni who donate to the university
  • Expend: Instructional expenditure per student
  • Grad.Rate: Graduation rate of student body

Pre-Processing

The data is pre-processed to create the desired feature and label matrices. The "Private" column is first converted to a numerical dummy variable, and the columns are then renamed for clarity.
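As a quick illustrative sketch (not part of the original analysis), pd.get_dummies with drop_first=True keeps a single 0/1 indicator column rather than one column per category:

import pandas as pd

demo = pd.DataFrame({"Private": ["Yes", "No", "Yes"]})  # hypothetical toy column
print(pd.get_dummies(demo, columns=["Private"], drop_first=True))
# one column remains, Private_Yes, with values 1, 0, 1
# (shown as True/False in recent pandas versions)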

In [4]:
# Clean data
colleges = colleges_raw # copy data
colleges = pd.get_dummies(colleges, columns = ['Private'], drop_first = True) # convert private to dummy

# Rename columns
colleges = colleges.rename(columns={"Top10perc" : "Top10pct", "Top25perc" : "Top25pct", "F.Undergrad" : "UndergradF", "P.Undergrad" : "UndergradP", "PhD" : "PhDpct", "Terminal" : "TerPct", "S.F.Ratio" : "SFratio", "Grad.Rate" : "GradRate", "Private_Yes" : "Private", "Room.Board" : "Housing", "perc.alumni" : "AlumniPct"})

colleges.head(5)
Out[4]:
Apps Accept Enroll Top10pct Top25pct UndergradF UndergradP Outstate Housing Books Personal PhDpct TerPct SFratio AlumniPct Expend GradRate Private
Abilene Christian University 1660 1232 721 23 52 2885 537 7440 3300 450 2200 70 78 18.1 12 7041 60 1
Adelphi University 2186 1924 512 16 29 2683 1227 12280 6450 750 1500 29 30 12.2 16 10527 56 1
Adrian College 1428 1097 336 22 50 1036 99 11250 3750 400 1165 53 66 12.9 30 8735 54 1
Agnes Scott College 417 349 137 60 89 510 63 12960 5450 450 875 92 97 7.7 37 19016 59 1
Alaska Pacific University 193 146 55 16 44 249 869 7560 4120 800 1500 76 72 11.9 2 10922 15 1

Plots are created to identify outliers in each of the columns, which can be removed for more accurate predictions. The z-scores are computed so that columns can be compared on a common scale, and box-and-whisker plots mark values falling more than 1.5·IQR beyond the quartiles, as shown below.

In [5]:
colleges_z = (colleges - colleges.mean())/colleges.std() # convert to z-scores
   
colleges_z.plot(kind='box', figsize=(20,5))
Out[5]:
<AxesSubplot:>

As observed, there are a significant number of outliers in each of the columns. Each outlier can be identified using this 1.5·IQR criterion and marked in the dataframe. For each college, if any statistic is flagged as an outlier, the entire row is removed from the analysis.
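Formally, with $Q_1$ and $Q_3$ the first and third quartiles of a column and $\mathrm{IQR} = Q_3 - Q_1$, a value $x$ is flagged as an outlier when

$$x < Q_1 - 1.5\,\mathrm{IQR} \quad \text{or} \quad x > Q_3 + 1.5\,\mathrm{IQR},$$

which is exactly the test implemented in is_outlier below.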

In [6]:
def is_outlier(x):
    Q25, Q75 = x.quantile([.25,.75])
    I = Q75 - Q25
    return (x < Q25 - 1.5*I) |  (x > Q75 + 1.5*I)

outliers = colleges.transform(is_outlier)

colleges_reg = colleges[~outliers].copy() # non-outlier data; outlier cells become NaN

colleges_reg.dropna(inplace = True) # drop any college with a flagged outlier

colleges_reg.head(5)
Out[6]:
Apps Accept Enroll Top10pct Top25pct UndergradF UndergradP Outstate Housing Books Personal PhDpct TerPct SFratio AlumniPct Expend GradRate Private
Abilene Christian University 1660.0 1232.0 721.0 23.0 52 2885.0 537.0 7440.0 3300.0 450.0 2200.0 70.0 78.0 18.1 12.0 7041.0 60.0 1
Adrian College 1428.0 1097.0 336.0 22.0 50 1036.0 99.0 11250.0 3750.0 400.0 1165.0 53.0 66.0 12.9 30.0 8735.0 54.0 1
Albertson College 587.0 479.0 158.0 38.0 62 678.0 41.0 13500.0 3335.0 500.0 675.0 67.0 73.0 9.4 11.0 9727.0 55.0 1
Albertus Magnus College 353.0 340.0 103.0 17.0 45 416.0 230.0 13290.0 5720.0 500.0 1500.0 90.0 93.0 11.5 26.0 8861.0 63.0 1
Albion College 1899.0 1720.0 489.0 37.0 68 1594.0 32.0 13868.0 4826.0 450.0 850.0 89.0 100.0 13.7 37.0 11487.0 73.0 1

This process removed a significant amount of data from analysis, summarized below.

In [7]:
org_num_colleges = len(colleges['Private'])
reg_num_colleges = len(colleges_reg['Private'])

print('Original number of colleges: ' + str(org_num_colleges) )
print('Number of outlier colleges: ' + str(org_num_colleges - reg_num_colleges) )
print('Number of remaining colleges: ' + str(reg_num_colleges) )
Original number of colleges: 777
Number of outlier colleges: 238
Number of remaining colleges: 539

While the number of dropped rows is significant, an ample number of colleges remains for analysis, so this cleaned data is used.

With the outliers removed from the data, the regression matrices are created. The non-financial features and the financial features are separated into two categories. To begin, the feature matrix is defined using only the non-financial features, but later analysis will involve both. The target matrix is defined as the out-of-state tuition cost in either case.

In [8]:
non_financial = ['Apps', 'Accept', 'Enroll', 'Top10pct', 'Top25pct', 'UndergradF', 'UndergradP', 'PhDpct', 'TerPct', 'SFratio', 'GradRate', 'Private']
financial = ['Housing', 'Books', 'Personal', 'AlumniPct']

X_nf = colleges_reg[non_financial]
X_f = colleges_reg[financial]

X_nf.head(5)
Out[8]:
Apps Accept Enroll Top10pct Top25pct UndergradF UndergradP PhDpct TerPct SFratio GradRate Private
Abilene Christian University 1660.0 1232.0 721.0 23.0 52 2885.0 537.0 70.0 78.0 18.1 60.0 1
Adrian College 1428.0 1097.0 336.0 22.0 50 1036.0 99.0 53.0 66.0 12.9 54.0 1
Albertson College 587.0 479.0 158.0 38.0 62 678.0 41.0 67.0 73.0 9.4 55.0 1
Albertus Magnus College 353.0 340.0 103.0 17.0 45 416.0 230.0 90.0 93.0 11.5 63.0 1
Albion College 1899.0 1720.0 489.0 37.0 68 1594.0 32.0 89.0 100.0 13.7 73.0 1
In [9]:
y = colleges_reg["Outstate"] # out of state tuition
y.head(5)
Out[9]:
Abilene Christian University     7440.0
Adrian College                  11250.0
Albertson College               13500.0
Albertus Magnus College         13290.0
Albion College                  13868.0
Name: Outstate, dtype: float64

Note that the feature matrix shown contains the maximum number of features that can be used for the analysis. Later calculations may reduce this to only the most relevant features; for example, the number of applications and the number of accepted students can be combined into a single acceptance-rate metric. For now, the complete non-financial feature matrix is used with the out-of-state tuition cost as the label matrix.

The data is finally split into training and testing sets so that each regression can be evaluated on data it has not seen. The testing size is set to 20% given the relatively small number of data entries.

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X_nf, y, test_size=0.2, random_state=2132)
X_train.head(5)
Out[10]:
Apps Accept Enroll Top10pct Top25pct UndergradF UndergradP PhDpct TerPct SFratio GradRate Private
Northeast Missouri State University 6040.0 4577.0 1620.0 36.0 72 5640.0 266.0 69.0 72.0 15.7 76.0 0
Wisconsin Lutheran College 152.0 128.0 75.0 17.0 41 282.0 22.0 48.0 48.0 8.5 50.0 1
Connecticut College 3035.0 1546.0 438.0 42.0 93 1630.0 232.0 86.0 95.0 10.7 91.0 1
Centenary College 369.0 312.0 90.0 12.0 46 396.0 526.0 41.0 85.0 9.5 24.0 1
Trinity University 2425.0 1818.0 601.0 62.0 93 2110.0 95.0 94.0 96.0 9.6 93.0 1
In [11]:
y_train.head(5)
Out[11]:
Northeast Missouri State University     4856.0
Wisconsin Lutheran College              9100.0
Connecticut College                    18740.0
Centenary College                      11400.0
Trinity University                     12240.0
Name: Outstate, dtype: float64

Feature Correlation

While there are 12 total features used for the regression modelling, some may be correlated with one another, which would allow the feature set to be reduced. Correlation is assessed by grouping the features that are plausibly related and producing a gridwise plot for each group. The groups to be analyzed are:

  • student body size : applications, enrollment, and accepted population sizes
  • high school demographics : top 10 and 25 percent enrollment
  • small school factor : PhD and terminal degree percentages, student-faculty ratio, and undergraduate population

For each of these sets, the Spearman correlation coefficient is computed between every pair of features in the grid.
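As a brief aside, the Spearman coefficient is the Pearson correlation computed on ranks, so it measures monotonic rather than strictly linear association. A minimal sketch with hypothetical data:

import pandas as pd

a = pd.Series([1, 2, 4, 8, 16])      # hypothetical: grows exponentially
b = pd.Series([10, 20, 30, 40, 50])  # hypothetical: grows linearly

print(a.corr(b, method="spearman"))               # 1.0 -- perfectly monotonic
print(a.rank().corr(b.rank(), method="pearson"))  # identical by definition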

In [12]:
# Student Body Size
cols = ["Apps", "Accept", "Enroll"]
sns.pairplot(data=X_nf[cols], height=2);

pw_spearman = X_nf[cols].corr(method = 'spearman') # compute pairwise spearman correlation
# Remove diagonals
for i in range(len(cols)):
    for j in range(len(cols)):
        if i==j:
            pw_spearman.iloc[i,j] = None # remove value
            
pw_spearman
Out[12]:
Apps Accept Enroll
Apps NaN 0.980951 0.888869
Accept 0.980951 NaN 0.911546
Enroll 0.888869 0.911546 NaN

There is a very strong correlation between the number of applications and the number of accepted students, so these two features are combined into a single acceptance-rate feature: the ratio of accepted students to applications, expressed as a percentage. In doing so, the magnitude of the application count is lost, but there is still a decent correlation between student enrollment and the number of accepted students, so this information is not lost. In other words, both the incoming student body size and the application-to-acceptance ratio are preserved.

In [13]:
colleges_reg['AccRate'] = 100* colleges_reg['Accept'] / colleges_reg['Apps'] # acceptance rate as percentage

Next, the correlation between the high school achievement features is plotted.

In [14]:
# High School Demographics
sns.relplot(data=colleges_reg, 
    x="Top10pct", y="Top25pct"
    );
print("Spearman correlation: " + str(X_nf["Top10pct"].corr(X_nf["Top25pct"],"spearman")) )
Spearman correlation: 0.9024959609998393

The above correlation is above 0.9, so these features are considered strongly correlated and one of them can be removed from the analysis. The top 25 percent data is omitted, as the top 10 percent figure is suspected to be the better predictor of tuition rate.

Lastly, the "small school factor" is assessed for correlation. For this we also compute the relationship between the undergraduate student population and see if the distribution of full and part time students is a constant ratio.

In [15]:
# Small School Factor

undergrad_spr = X_nf["UndergradP"].corr(X_nf["UndergradF"], "spearman")
print("Undergraduate distribution spearman correlation: " + str(undergrad_spr))

# compute undergrad population
X_nf_temp = X_nf.copy() # copy data so X_nf itself is not modified
X_nf_temp["Undergrads"] = X_nf_temp["UndergradP"] + X_nf_temp["UndergradF"] # total undergraduate population

cols = ["Undergrads","PhDpct", "TerPct", "SFratio"]

sns.pairplot(data=X_nf_temp[cols], height=2);

pw_spearman = X_nf_temp[cols].corr(method = 'spearman') # compute pairwise spearman correlation
# Remove diagonals
for i in range(len(cols)):
    for j in range(len(cols)):
        if i==j:
            pw_spearman.iloc[i,j] = None # remove value
            
pw_spearman
Undergraduate distribution spearman correlation: 0.4018358685729641
Out[15]:
Undergrads PhDpct TerPct SFratio
Undergrads NaN 0.215165 0.182446 0.462291
PhDpct 0.215165 NaN 0.848562 -0.070788
TerPct 0.182446 0.848562 NaN -0.133848
SFratio 0.462291 -0.070788 -0.133848 NaN

From the above results, it is concluded that the applications and accepted students columns can be replaced by the single acceptance-rate metric, and that the top 25 percent column can be omitted from the analysis.

Let "X_nf" denote the features containing all non-financial features, "X_uc" denoting those features which are uncorrelated as discussed in this section, and "X" to be the most reduced set of features which is discussed in later sections. Financial features, and features previously thought to be correlated may be used in later analysis. The testing and training data sets are redefined using the above analysis.

In [16]:
X = colleges_reg[['AccRate', 'Enroll', 'Top10pct', 'UndergradF', 'UndergradP', 'PhDpct', 'TerPct', 'SFratio', 'GradRate', 'Private']]

X_uc_train, X_uc_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2132) # simplified data
X_nf_train, X_nf_test, y_train, y_test = train_test_split(X_nf, y, test_size=0.2, random_state=2132) # full data

X_uc_train.head(5)
Out[16]:
AccRate Enroll Top10pct UndergradF UndergradP PhDpct TerPct SFratio GradRate Private
Northeast Missouri State University 75.778146 1620.0 36.0 5640.0 266.0 69.0 72.0 15.7 76.0 0
Wisconsin Lutheran College 84.210526 75.0 17.0 282.0 22.0 48.0 48.0 8.5 50.0 1
Connecticut College 50.939044 438.0 42.0 1630.0 232.0 86.0 95.0 10.7 91.0 1
Centenary College 84.552846 90.0 12.0 396.0 526.0 41.0 85.0 9.5 24.0 1
Trinity University 74.969072 601.0 62.0 2110.0 95.0 94.0 96.0 9.6 93.0 1

Results

The following regressions are performed on the dataset: (1) LASSO, (2) Ridge, and (3) non-linear (k-nearest neighbors). The first two regularize the least-squares model, shrinking the weights (Ridge) or inducing sparsity (LASSO), which favors simplified regression models.
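For reference, both penalties modify the same least-squares objective over the weight vector $w$; using scikit-learn's conventions with $n$ training samples,

$$\text{Ridge:} \quad \min_w \; \|Xw - y\|_2^2 + \alpha \|w\|_2^2, \qquad \text{LASSO:} \quad \min_w \; \frac{1}{2n}\|Xw - y\|_2^2 + \alpha \|w\|_1,$$

where the $\|w\|_1$ penalty tends to zero out individual weights while the $\|w\|_2^2$ penalty shrinks all of them smoothly.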

Linear Regression

The plain linear model is computed first to establish a reference coefficient of determination ($R^2$) against which the other regression models can be compared. A pipeline is fit to give each column an equal range of values: in the current data the student population can be in the thousands while the "private" column is at most 1, so standardization is needed.
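Standardization rescales each column to zero mean and unit variance,

$$x' = \frac{x - \mu}{\sigma},$$

with $\mu$ and $\sigma$ the column's training mean and standard deviation; this is what StandardScaler computes inside each pipeline.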

The regression score is computed both for the simplified data set (correlated features removed) and for the full data set (the complete non-financial feature set). This evaluates whether the preceding analysis is valid, i.e., whether removing the correlated features achieves similar results with much less data.

In [17]:
# Linear Model
lm_nf = LinearRegression()
lm_red = LinearRegression()
pipe_nf = make_pipeline(StandardScaler(), lm_nf) # make a pipeline to standardize columns 
pipe_red = make_pipeline(StandardScaler(), lm_red)

# Full data set
pipe_nf.fit(X_nf_train, y_train) # fit to training data
lin_score_nf = pipe_nf.score(X_nf_test, y_test) # score on testing data
print("Linear R^2 score, full feature set:", f"{lin_score_nf:.5f}")

# Uncorrelated data set
pipe_red.fit(X_uc_train, y_train) # fit to training data
lin_score_unc = pipe_red.score(X_uc_test, y_test) # score on testing data
print("Linear R^2 score, uncorrelated feature set:", f"{lin_score_unc:.5f}")

pct_red = (lin_score_nf - lin_score_unc) / lin_score_nf
print("Score reduction: " + f"{(pct_red*100):0.3f}" + "%")
Linear R^2 score, full feature set: 0.68001
Linear R^2 score, uncorrelated feature set: 0.63242
Score reduction: 6.998%

As observed, removing the correlated features reduces the regression score by roughly 7%, which is not insignificant. Therefore, the complete set of non-financial features, and the corresponding regression score, is used as the baseline against which subsequent regressions are compared.

The weight coefficient of each feature in this regression quantifies how strongly that feature contributes to the predicted tuition value. The weights can themselves be standardized to allow comparison relative to the group; to do this, the z-score is computed for the weight vector.

In [18]:
data_lin = pd.DataFrame( {
    "feature": X_nf.columns,
    "Linear": pipe_nf[1].coef_
    } ) # weight vector

data_lin_sorted = data_lin.sort_values(by="Linear") #sort data by weight score
# compute z-scores
data_lin_sorted["Linear_z"] = (data_lin_sorted["Linear"] - data_lin_sorted["Linear"].mean())/data_lin_sorted["Linear"].std()

data_lin_sorted.reset_index(drop=True, inplace = True) # reset numerical indices

data_lin_sorted
Out[18]:
feature Linear Linear_z
0 Enroll -1506.717952 -1.579336
1 Apps -786.863348 -0.923764
2 SFratio -719.506269 -0.862422
3 UndergradF -292.914322 -0.473924
4 Top25pct -34.051891 -0.238178
5 UndergradP 62.373969 -0.150363
6 PhDpct 211.419077 -0.014627
7 Top10pct 613.126309 0.351208
8 GradRate 619.775148 0.357263
9 TerPct 732.828823 0.460221
10 Private 990.837763 0.695190
11 Accept 2839.456200 2.378729
In [19]:
print('Largest positive weight: ' + str( data_lin_sorted['feature'][11] ) )
print('Largest negative weight: ' + str( data_lin_sorted['feature'][0] ) )
Largest positive weight: Accept
Largest negative weight: Enroll

The linear regression establishes a decent baseline, which the regularized and non-linear regressions can aim to match or improve.

LASSO Regression

For the LASSO regression, a range of $\alpha$ values is tested to find the best coefficient of determination score.

In [20]:
kf = KFold(n_splits=4, shuffle=True, random_state=302)
alpha = np.linspace(0, 10, 50)[1:]  # exclude alpha=0

_,scores = validation_curve(
    Lasso(),
    X_nf_train, y_train,
    cv=kf,
    n_jobs=-1,
    param_name="alpha", param_range=alpha
    )

fig = sns.lineplot(x=alpha, y=np.mean(scores, axis=1) );
fig.set(xlabel="alpha", ylabel="R^2 score");

As observed in the validation curve above, increasing the $\alpha$ parameter only decreases the $R^2$ score, suggesting that $\alpha=0$, equivalent to the plain linear regression, would be best. The weight of each feature at the best $\alpha$ can be found in order to determine which features can be omitted from the analysis. Note that because the grid excludes zero, the minimum non-zero value of $\alpha$ is selected, avoiding a simple reproduction of the linear regression.

In [21]:
best_alp = alpha[np.mean(scores,axis=1).argmax()] # get best alpha value from plot before

lass = Lasso(alpha=best_alp)
pipe_lass = make_pipeline(StandardScaler(), lass) # make pipeline
pipe_lass.fit(X_nf_train, y_train) # fit best lasso curve

data_las = pd.DataFrame( {
    "feature": X_nf.columns,
    "LASSO": pipe_lass[1].coef_
    } )

lass_score_1 = pipe_lass.score(X_nf_test, y_test)
print(f"Best coefficient of determination: ", f"{lass_score_1 :.5f}")

data_las = data_las.sort_values(by="LASSO") # sort values
data_las.reset_index(drop=True, inplace = True) # reset numerical indices

data_las
Best coefficient of determination: 0.67999
Out[21]:
feature LASSO
0 Enroll -1503.929429
1 Apps -781.725753
2 SFratio -719.425070
3 UndergradF -293.279955
4 Top25pct -32.550627
5 UndergradP 61.888626
6 PhDpct 211.405658
7 Top10pct 611.267706
8 GradRate 619.595481
9 TerPct 732.874996
10 Private 991.518574
11 Accept 2832.596764
In [22]:
print('Largest positive weight: ' + str( data_las["feature"][11] ) )
print('Largest negative weight: ' + str( data_las["feature"][0] ) )
Largest positive weight: Accept
Largest negative weight: Enroll

As expected, the results are almost identical to the linear regression, with only a slightly lower coefficient of determination score.

With these LASSO weights, the strongest and weakest features for the regression can be identified. The absolute value of each weight is taken so that features can be compared regardless of the sign of their correlation. The weights are then normalized so that a new value, called the $L_z$ score, denotes each feature's percentage of the total weight magnitude; that is, $\sum L_z = 100\%$ across the features.
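Concretely, for feature $i$ with LASSO weight $w_i$,

$$L_z^{(i)} = 100 \times \frac{|w_i|}{\sum_j |w_j|},$$

which is computed directly in the cell below.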

In [23]:
wt_sum = sum(abs(data_las["LASSO"])) # sum of all weights

data_las["Lz%"] = 100* abs(data_las["LASSO"]) / wt_sum # create Lz score

data_las.sort_values(by="Lz%", inplace = True) # sort by Lz score
data_las.reset_index(drop=True, inplace = True) # reset numerical indices

data_las
Out[23]:
feature LASSO Lz%
0 Top25pct -32.550627 0.346576
1 UndergradP 61.888626 0.658946
2 PhDpct 211.405658 2.250898
3 UndergradF -293.279955 3.122638
4 Top10pct 611.267706 6.508346
5 GradRate 619.595481 6.597015
6 SFratio -719.425070 7.659929
7 TerPct 732.874996 7.803135
8 Apps -781.725753 8.323263
9 Private 991.518574 10.556989
10 Enroll -1503.929429 16.012777
11 Accept 2832.596764 30.159488

This analysis suggests that the top 25 percent and the part-time undergraduate population features each contribute less than 1% to the final prediction, in either a positive or negative sense. These features can therefore be omitted from the analysis, as their contributions to the regression are negligible.

This can be validated by performing the linear and LASSO regressions again with this reduced set of features shown below.

In [24]:
non_financial_2 = list(data_las["feature"][2:]) # reduced set of features
print("Features used in analysis: ")
print(non_financial_2)
Features used in analysis: 
['PhDpct', 'UndergradF', 'Top10pct', 'GradRate', 'SFratio', 'TerPct', 'Apps', 'Private', 'Enroll', 'Accept']
In [25]:
X_nf_2 = colleges_reg[non_financial_2] # redefine feature matrix
X_train, X_test, y_train, y_test = train_test_split(X_nf_2, y, test_size=0.2, random_state=2132) # X denotes reduced set

# Linear Model
lm_red_2 = LinearRegression()
pipe_lin_red = make_pipeline(StandardScaler(), lm_red_2)
pipe_lin_red.fit(X_train, y_train) # fit to training data
lin_score_red = pipe_lin_red.score(X_test, y_test) # score on testing data
print("Linear R^2 score:", f"{lin_score_red:.5f}")
Linear R^2 score: 0.68104

The linear regression sees a small increase in the coefficient of determination when using this reduced list of features, which not only validates the assumption that the omitted features were negligible, but also shows that removing them can improve the regression's ability to predict the correct tuition value.

The LASSO regression is performed again with this reduced set of features.

In [26]:
kf = KFold(n_splits=4, shuffle=True, random_state=302)
alpha = np.linspace(0, 10, 50)[1:]  # exclude alpha=0

_,scores = validation_curve(
    Lasso(),
    X_train, y_train,
    cv=kf,
    n_jobs=-1,
    param_name="alpha", param_range=alpha
    )

fig = sns.lineplot(x=alpha, y=np.mean(scores, axis=1) );
fig.set(xlabel="alpha", ylabel="R^2 score");
In [27]:
best_alp = alpha[np.mean(scores,axis=1).argmax()] # get best alpha value from plot before

lass_2 = Lasso(alpha=best_alp)
pipe_lass_2 = make_pipeline(StandardScaler(), lass_2)
pipe_lass_2.fit(X_train, y_train) # fit best lasso curve

lass_score_red = pipe_lass_2.score(X_test, y_test)
print(f"Best coefficient of determination: ", f"{lass_score_red:.5f}")
Best coefficient of determination:  0.68105

As observed, the LASSO regression score also experiences a slight increase which validates these results. The set of features used in the analysis is then shown below, listed from least (left) to most (right) significant.

In [28]:
X_train.head(5)
Out[28]:
PhDpct UndergradF Top10pct GradRate SFratio TerPct Apps Private Enroll Accept
Northeast Missouri State University 69.0 5640.0 36.0 76.0 15.7 72.0 6040.0 0 1620.0 4577.0
Wisconsin Lutheran College 48.0 282.0 17.0 50.0 8.5 48.0 152.0 1 75.0 128.0
Connecticut College 86.0 1630.0 42.0 91.0 10.7 95.0 3035.0 1 438.0 1546.0
Centenary College 41.0 396.0 12.0 24.0 9.5 85.0 369.0 1 90.0 312.0
Trinity University 94.0 2110.0 62.0 93.0 9.6 96.0 2425.0 1 601.0 1818.0

Ridge Regression

For the Ridge regression, the $\alpha$ value is determined by testing a range of values and keeping the one with the best coefficient of determination score. As $\alpha$ increases, the Euclidean norm of the coefficient vector shrinks, creating a "simpler" model, but at the penalty of a lower coefficient of determination if taken too far. The optimal value is therefore determined by plotting the coefficient of determination at each $\alpha$ value.

In [29]:
# Ridge Regression
alphas = [] # alpha values measured
r2_scores = [] # associated scores
wt_norms = [] # euclidean norm of feature weights
 
for a in range(1, 200, 1): # 199 alpha values, excluding alpha=0
    alp = a * (5/200) # alpha ranges over (0, 5) in steps of 0.025
    rr_1 = Ridge(alpha=alp)    # ridge model with current regularization strength
    pipe_rr_1 = make_pipeline(StandardScaler(), rr_1)
    pipe_rr_1.fit(X_train, y_train)
    alphas.append(alp) # add to alpha list
    r2_scores.append(pipe_rr_1.score(X_test, y_test)) # add score
    wt_norms.append(norm(pipe_rr_1[1].coef_)) # add norm
    
ridge_r2_1 = max(r2_scores) # alpha=0 was never evaluated
print("Reduced Ridge R^2 score:" , f"{ridge_r2_1:.5f}")
print("Best alpha value: ", f"{alphas[r2_scores.index(ridge_r2_1)]:0.3f}") 

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
fig.tight_layout()

# R^2 plot
plt.subplot(1,2,1)    
plt.plot(alphas, r2_scores)
plt.xlabel('alpha')
plt.ylabel('R^2')

# 2-norm plot
plt.subplot(1,2,2)
plt.plot(alphas, wt_norms)
plt.xlabel('alpha')
plt.ylabel('2-norm')

plt.show()
Reduced Ridge R^2 score: 0.68157
Best alpha value:  3.075

As observed, increasing $\alpha$ allows the ridge regularization to produce a score exceeding the previous regression values. The strength of this effect is then compared against the original complete feature set, to see whether the non-reduced set yields a significantly larger increase in the regression score.

In [30]:
# Ridge Regression
alphas = [] # alpha values measured
r2_scores = [] # associated scores
wt_norms = [] # euclidean norm of feature weights
 
for a in range(1, 200, 1): # 199 alpha values, excluding alpha=0
    alp = a * (5/200) # alpha ranges over (0, 5) in steps of 0.025
    rr_2 = Ridge(alpha=alp)    # ridge model with current regularization strength
    pipe_rr_2 = make_pipeline(StandardScaler(), rr_2) # use pipeline
    pipe_rr_2.fit(X_nf_train, y_train)
    alphas.append(alp) # add to alpha list
    r2_scores.append(pipe_rr_2.score(X_nf_test, y_test)) # add score
    wt_norms.append(norm(pipe_rr_2[1].coef_)) # add norm
    
ridge_r2_2 = max(r2_scores) # alpha=0 was never evaluated
print("Non-Reduced Ridge R^2 score:" , f"{ridge_r2_2:.5f}")
print("Best alpha value: ", f"{alphas[r2_scores.index(ridge_r2_2)]:0.3f}") 
Non-Reduced Ridge R^2 score: 0.68055
Best alpha value:  2.950

The percent increase of the $R^2$ value using the Ridge regression with the full non-financial feature set as compared to the reduced feature set is calculated below.

In [31]:
print("Percent increase of Ridge R^2: ", f"{100*(ridge_r2_2-ridge_r2_1)/ridge_r2_1:0.5f}")
Percent increase of Ridge R^2:  -0.15061

As observed, the ridge regression performs slightly worse with the original, full set of non-financial features than with the subset determined to be significant by the LASSO regularization. The reduced set of features is again validated as the set to use.

The ridge regression weights for this reduced feature set can now be gathered and compared with the other linear regression weights. For each previous regression type (plain linear and LASSO-regularized), the feature weights corresponding to the best regression score are displayed, indexed by feature.

In [32]:
# Linear
lin_wts = pd.DataFrame( {
    "feature": X_train.columns,
    "Linear": pipe_lin_red[1].coef_
    } ) # linear weights, reduced feature set
lin_wts.set_index('feature', inplace=True) # index by feature

# LASSO
lass_wts = pd.DataFrame( {
    "feature": X_train.columns,
    "LASSO": pipe_lass_2[1].coef_
    } ) # lasso weights, reduced feature set
lass_wts.set_index('feature', inplace=True) # index by feature

# Ridge
rr_wts = pd.DataFrame( {
    "feature": X_train.columns,
    "Ridge": pipe_rr_1[1].coef_
    } ) # ridge weights, reduced feature set
rr_wts.set_index('feature', inplace=True) # index by feature

# Merge weight values, all indices are in same order by default
frame = {"Linear" : lin_wts["Linear"], "LASSO" : lass_wts["LASSO"], "Ridge" : rr_wts["Ridge"]}
all_wts = pd.DataFrame(frame)
all_wts.sort_values(by="Ridge", ascending=False, inplace=True) # sort by ridge score

# Gather scores, append to dataframe
all_scores = {"Linear" : [lin_score_red], "LASSO" : [lass_score_red], "Ridge" : [ridge_r2_1]}
scores = pd.DataFrame(data=all_scores, index=["R^2"])

# add empty row of data
all_wts = pd.concat([all_wts, pd.DataFrame(data={"Linear" : [' '], "LASSO" : [' '], "Ridge" : [' ']},index=[' '])], ignore_index = False)
# add R^2 scores
all_wts = pd.concat([all_wts, scores], ignore_index = False)
all_wts
Out[32]:
Linear LASSO Ridge
Accept 2859.101456 2851.748094 2248.940443
Private 991.525513 992.21873 1032.91443
TerPct 732.412857 732.563779 735.306665
GradRate 616.950962 616.856767 619.146294
Top10pct 573.273606 572.780274 531.294577
PhDpct 202.599214 202.63151 231.717265
UndergradF -242.083707 -242.688536 -377.021603
Apps -795.436993 -790.005652 -377.653957
SFratio -716.367382 -716.320984 -716.131031
Enroll -1537.566033 -1534.489051 -1190.533883
R^2 0.681041 0.681053 0.681572

As observed, the ranking of the weights is generally consistent across the three regressions, with the number of accepted students, the private status, and the percentage of faculty holding terminal degrees being the strongest positively weighted features. Adding regularization to the linear regression improves the coefficient of determination score slightly under both the LASSO and Ridge methods.

Non-Linear Regression

A k-nearest neighbors (kNN) regression is computed on this dataset to observe the degree to which tuition is better predicted by a non-linear algorithm than by the preceding linear regressions. A grid search is used over the number of neighbors, the distance weighting scheme, and the associated distance metric to determine suitable hyperparameters for the analysis.
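The search below is written as explicit loops scored on the held-out test set. As an aside, a comparable cross-validated search could be expressed more compactly with scikit-learn's GridSearchCV; a minimal sketch (the double-underscore parameter names follow the pipeline's step naming):

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# candidate hyperparameters: neighbor count, weighting scheme, distance metric
param_grid = {
    "kneighborsregressor__n_neighbors": list(range(1, 30)),
    "kneighborsregressor__weights": ["uniform", "distance"],
    "kneighborsregressor__metric": ["manhattan", "euclidean"],
}

knn_pipe = make_pipeline(StandardScaler(), KNeighborsRegressor())
search = GridSearchCV(knn_pipe, param_grid, cv=6, n_jobs=-1)  # R^2 scoring by default
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)

Note that this selects hyperparameters by cross-validation on the training folds, which avoids tuning directly against the test set.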

In [33]:
# Uniform metric scores
scores_u = []
# Distance metric scores
scores_d1 = [] # manhattan distance
scores_d2 = [] # euclidean distance

k_vals = [] # values of k to search through
k_max = 30 # maximum k value to search for

# Uniform
for k in range(1,k_max,1): # k ranges from 1 to k_max-1
    k_vals.append(k) # add k to measurement data
    kNN = KNeighborsRegressor(n_neighbors=k, weights='uniform')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_u.append(kNN_pipe.score(X_test, y_test))

# Distance: Manhattan
for k in range(1,k_max,1): # k ranges from 1 to k_max-1
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='manhattan')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_d1.append(kNN_pipe.score(X_test, y_test))

# Distance: Euclidean
for k in range(1,k_max,1): # k ranges from 1 to k_max-1
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='euclidean')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_d2.append(kNN_pipe.score(X_test, y_test))
                    
data = {"k" : k_vals, "uniform" : scores_u, "manhattan" : scores_d1, "euclidean" : scores_d2}
r2_scores = pd.DataFrame(data=data, index=k_vals)
r2_scores.set_index("k", inplace=True) # index by k

sns.lineplot(data=r2_scores[['uniform', 'manhattan', 'euclidean']], 
             palette = ['red', 'green', 'blue'], 
             linestyle = 'solid')
Out[33]:
<AxesSubplot:xlabel='k'>

As observed, there is an optimal number of neighbors $k$ under each weighting scheme. The maximum coefficient of determination for this non-linear regression is found by taking the largest $R^2$ across the entire search space.

In [34]:
# Uniform
best_u_r2 = r2_scores['uniform'].max()
best_u_k = k_vals[r2_scores['uniform'].argmax()]

# Manhattan
best_d1_r2 = r2_scores['manhattan'].max()
best_d1_k = k_vals[r2_scores['manhattan'].argmax()]

# Euclidean
best_d2_r2 = r2_scores['euclidean'].max()
best_d2_k = k_vals[r2_scores['euclidean'].argmax()]

mets = ['uniform', 'manhattan', 'euclidean'] # metrics used
vals = [best_u_r2, best_d1_r2, best_d2_r2] # list of best scores
ks = [best_u_k, best_d1_k, best_d2_k] # list of best associated k values

best_r2_val = max(vals)
best_r2_m = mets[np.argmax(vals)]
best_r2_k = ks[np.argmax(vals)]

print("Best kNN computed using " + str(best_r2_m) + " metric with k=" + str(best_r2_k) )
print("kNN R^2 score = " + str(best_r2_val))
Best kNN computed using euclidean metric with k=9
kNN R^2 score = 0.6972190555157138

The non-linear regression performs better than the linear regressions, as expected. While the improvement is only slight, it demonstrates the capability of such an algorithm and provides promising results for the predictor.

Discussion

Regression Summary

The coefficient of determination for each regression method performed is listed in the table below. The column headers are defined as:

  • model : type of regression model used (e.g., linear, linear with LASSO regularization, non-linear, etc.)
  • set : features used in the analysis, either non-financial features or financial features
  • features : all available features (full), features not omitted in the analysis from the LASSO regression results (reduced), or features found to not be correlated from the pre-processing results (uncorrelated)
  • R^2 : the best performing coefficient of determination score, if multiple were found
In [35]:
data = {"model" : ["Linear", "Linear", "LASSO", "Linear", "LASSO", "Ridge", "Ridge", "kNN"], 
        "set" : ["Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial"], 
        "features" : ["Full", "Uncorrelated", "Full", "Reduced", "Reduced", "Reduced", "Full", "Reduced"],
        "R^2": [lin_score_nf, lin_score_unc, lass_score_1, lin_score_red, lass_score_red, ridge_r2_1, ridge_r2_2, best_r2_val]}

r2_scores = pd.DataFrame(data=data) # make dataframe
r2_scores.sort_values(by="R^2", ascending=False, inplace=True) # sort by score
r2_scores.reset_index(drop=True, inplace = True) # reset numerical indices

r2_scores # display data
Out[35]:
model set features R^2
0 kNN Non-financial Reduced 0.697219
1 Ridge Non-financial Reduced 0.681572
2 LASSO Non-financial Reduced 0.681053
3 Linear Non-financial Reduced 0.681041
4 Ridge Non-financial Full 0.680546
5 Linear Non-financial Full 0.680006
6 LASSO Non-financial Full 0.679995
7 Linear Non-financial Uncorrelated 0.632418

Of all the regression models tested, the non-linear k-nearest neighbors performed the best. This method accommodates more complexity in the features: instead of fitting the data to a simplified linear equation, the regression averages the labels of nearby data points, so large gaps in the data, non-uniform value distributions, and the like have far less effect. Such behavior suits this dataset, whose features span wide ranges with differing distributions, as shown by the z-scores in the pre-processing section of the report.
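For uniform weights, the kNN prediction at a query point $x$ is simply the mean label of its $k$ nearest training points $N_k(x)$,

$$\hat{y}(x) = \frac{1}{k} \sum_{i \in N_k(x)} y_i,$$

with distance weighting replacing the plain mean by an inverse-distance weighted mean.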

While the initial analysis showed some of the features to be strongly correlated, and thus candidates for removal, the uncorrelated feature set produced a significant drop in performance compared to the full set. The explanation is visible in the weight coefficients of the linear regression on the full feature set: the number of accepted students and the number of enrolled students carried the largest weights, yet with opposite signs. Although these features were shown to be highly correlated, the weights suggest that the subtle differences between them have a large effect on the predicted tuition cost.

A LASSO regularization was performed, which suggested that the top 25% high school achievement feature and the part-time undergraduate population feature had negligible effects on regression performance and could be omitted. Removing them actually increased performance, likely because the smaller feature set simplified the model and prevented overfitting to the training set. Under both the Ridge and LASSO regularization methods, this reduced feature set gave higher performance.

The Ridge regression achieved a higher overall coefficient of determination than the LASSO regression, with both optimized over their respective hyperparameters. This is likely because LASSO pushes toward a sparse set of features, simplifying the model by eliminating weights, which causes performance to drop as $\alpha$ increases. The Ridge regression instead distributes the shrinkage more evenly across the weights, producing a maximum between the plain linear regression ($\alpha=0$) and an over-regularized model in which the details are lost (large $\alpha$).

The most important features for each linear regression can be identified by taking the absolute value of that regression's weights and comparing each to the total sum of weight magnitudes. This is the $L_z$ score discussed previously.

In [36]:
feature_weights = all_wts.iloc[:10] # remove R^2 scores
feature_weights = feature_weights.apply(lambda x: 100*abs(x)/abs(x).sum()) # compute Lz score
feature_weights.sort_values(by="Ridge", ascending=False, inplace=True) # sort by ridge features

print("Feature weight distribution by percentage to overall weight magnitude (Lz value): ")

feature_weights
Feature weight distribution by percentage to overall weight magnitude (Lz value): 
Out[36]:
Linear LASSO Ridge
Accept 30.851445 30.822034 27.900202
Enroll 16.591274 16.584941 14.769682
Private 10.699164 10.724019 12.814266
TerPct 7.903181 7.917637 9.122164
SFratio 7.73004 7.742083 8.884273
GradRate 6.657276 6.667062 7.681087
Top10pct 6.185971 6.190678 6.591204
Apps 8.583249 8.538475 4.685149
UndergradF 2.612231 2.623007 4.677304
PhDpct 2.186169 2.190066 2.874669

In all three linear regressions computed, the number of accepted students, the number of enrolled students, and the private school status were the highest weighted features relative to the magnitudes of all other features. Across the three regressions, these three features alone accounted for 55-58% of the total feature weight magnitude and thus were the most significant to each regression. On the opposite end, the percentage of faculty with PhDs, the full-time undergraduate student population, and the number of applications had little impact on the regression; combined, these features accounted for only 12-13% of the total feature weight magnitude.

Using the kNN regression with the Euclidean metric and $k=9$, the tuition cost for the University of Delaware can be predicted.

In [37]:
# Actual Value
print("University of Delaware")
k_act = colleges_raw["Outstate"]["University of Delaware"]
print("Actual tuition cost: $", f"{k_act:0.2f}" )

# Predicted Value
# train kNN using best model
kNN = KNeighborsRegressor(n_neighbors=9, weights='distance', metric='euclidean')
kNN_pipe = make_pipeline(StandardScaler(), kNN)
kNN_pipe.fit(X_train, y_train)

# get query data
features = X_train.columns
udel = colleges.loc[["University of Delaware"]] # get row of data
X_query = udel[features]

khat = kNN_pipe.predict(X_query)
print("Predicted tuition cost: $", f"{khat[0]:0.2f}" )
err = 100*(khat[0] - k_act) / k_act
print("Percent error:", f"{err:0.2f}", "%" )
University of Delaware
Actual tuition cost: $ 10220.00
Predicted tuition cost: $ 9727.27
Percent error: -4.82 %

While the regressor is not perfect, it is able to predict the tuition cost of a university with a coefficient of determination of $R^2=0.697$ using only a set of features completely unrelated to any financial information about the school.


Financial Features

The previous analysis focused entirely on predicting the tuition rate using features with no inherent financial information. With access to financial data, however, the regression might be expected to predict tuition much better. The following set of features is used.

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_f, y, test_size=0.2, random_state=2132) #financial set of features
X_train.head(5)
Out[38]:
Housing Books Personal AlumniPct
Northeast Missouri State University 3416.0 400.0 1100.0 13.0
Wisconsin Lutheran College 3700.0 500.0 1400.0 26.0
Connecticut College 6300.0 600.0 500.0 40.0
Centenary College 5400.0 500.0 760.0 20.0
Trinity University 5150.0 500.0 490.0 20.0

This is a much more limited feature set, but its features are much more closely connected to tuition costs. The kNN analysis from before is performed again, now with this set of features.
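Since the same three-way search appears several times in this report, it could be wrapped in a small helper; a hypothetical refactor is sketched below (illustrative only, the report keeps the explicit loops):

import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

def knn_score_grid(X_train, y_train, X_test, y_test, k_max=30):
    # test-set R^2 for each k under each weighting/metric setting
    settings = {
        "uniform": {"weights": "uniform"},
        "manhattan": {"weights": "distance", "metric": "manhattan"},
        "euclidean": {"weights": "distance", "metric": "euclidean"},
    }
    scores = {name: [] for name in settings}
    for name, kwargs in settings.items():
        for k in range(1, k_max):
            pipe = make_pipeline(StandardScaler(),
                                 KNeighborsRegressor(n_neighbors=k, **kwargs))
            pipe.fit(X_train, y_train)
            scores[name].append(pipe.score(X_test, y_test))
    return pd.DataFrame(scores, index=pd.Index(range(1, k_max), name="k"))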

In [39]:
# Uniform metric scores
scores_u = []
# Distance metric scores
scores_d1 = [] # manhattan distance
scores_d2 = [] # euclidean distance

k_vals = [] # values of k to search through
k_max = 50 # maximum k value to search for

# Uniform
for k in range(1,k_max,1): # range from k=1 to k_max
    k_vals.append(k) # add k to measurement data
    kNN = KNeighborsRegressor(n_neighbors=k, weights='uniform')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_u.append(kNN_pipe.score(X_test, y_test))

# Distance: Manhattan
for k in range(1,k_max,1): # range from k=1 to k_max
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='manhattan')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_d1.append(kNN_pipe.score(X_test, y_test))

# Distance: Euclidean
for k in range(1,k_max,1): # range from k=1 to k_max
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='euclidean')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_d2.append(kNN_pipe.score(X_test, y_test))
                    
data = {"k" : k_vals, "uniform" : scores_u, "manhattan" : scores_d1, "euclidean" : scores_d2}
r2_scores = pd.DataFrame(data=data, index=k_vals)
r2_scores.set_index("k", inplace=True) # index by k

sns.lineplot(data=r2_scores[['uniform', 'manhattan', 'euclidean']], 
             palette = ['red', 'green', 'blue'], 
             linestyle = 'solid')
Out[39]:
<AxesSubplot:xlabel='k'>

This regression is observed to perform much worse than before, likely due to the very small feature set used for analysis. The best coefficient of determination score and associated model are found below.

In [40]:
# Uniform
best_u_r2 = r2_scores['uniform'].max()
best_u_k = k_vals[r2_scores['uniform'].argmax()]

# Manhattan
best_d1_r2 = r2_scores['manhattan'].max()
best_d1_k = k_vals[r2_scores['manhattan'].argmax()]

# Euclidean
best_d2_r2 = r2_scores['euclidean'].max()
best_d2_k = k_vals[r2_scores['euclidean'].argmax()]

mets = ['uniform', 'manhattan', 'euclidean'] # metrics used
vals = [best_u_r2, best_d1_r2, best_d2_r2] # list of best scores
ks = [best_u_k, best_d1_k, best_d2_k] # list of best associated k values

best_r2_val_f = max(vals)
best_r2_m = mets[np.argmax(vals)]
best_r2_k = ks[np.argmax(vals)]

print("Best kNN computed using " + str(best_r2_m) + " metric with k=" + str(best_r2_k) )
print("kNN R^2 score = " + str(best_r2_val_f))
Best kNN computed using uniform metric with k=15
kNN R^2 score = 0.43693840390467686

We can retrain the kNN algorithm using the entire feature set, including both the financial and non-financial features.

In [41]:
# Redefine datasets
X = colleges_reg[financial + non_financial]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2132) # combined financial and non-financial features

# Uniform metric scores
scores_u = []
# Distance metric scores
scores_d1 = [] # manhattan distance
scores_d2 = [] # euclidean distance

k_vals = [] # values of k to search through
k_max = 30 # maximum k value to search for

# Uniform
for k in range(1,k_max,1): # range from k=1 to k_max
    k_vals.append(k) # add k to measurement data
    kNN = KNeighborsRegressor(n_neighbors=k, weights='uniform')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_u.append(kNN_pipe.score(X_test, y_test))

# Distance: Manhattan
for k in range(1,k_max,1): # range from k=1 to k_max
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='manhattan')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_d1.append(kNN_pipe.score(X_test, y_test))

# Distance: Euclidean
for k in range(1,k_max,1): # range from k=1 to k_max
    kNN = KNeighborsRegressor(n_neighbors=k, weights='distance', metric='euclidean')
    kNN_pipe = make_pipeline(StandardScaler(), kNN)
    kNN_pipe.fit(X_train, y_train)
    scores_d2.append(kNN_pipe.score(X_test, y_test))
                    
data = {"k" : k_vals, "uniform" : scores_u, "manhattan" : scores_d1, "euclidean" : scores_d2}
r2_scores = pd.DataFrame(data=data, index=k_vals)
r2_scores.set_index("k", inplace=True) # index by k

sns.lineplot(data=r2_scores[['uniform', 'manhattan', 'euclidean']], 
             palette = ['red', 'green', 'blue'], 
             linestyle = 'solid')
Out[41]:
<AxesSubplot:xlabel='k'>

This combined set of features appears to perform similarly to the non-financial data alone. The coefficient of determination score is identified for this regression as well.

In [42]:
# Uniform
best_u_r2 = r2_scores['uniform'].max()
best_u_k = k_vals[r2_scores['uniform'].argmax()]

# Manhattan
best_d1_r2 = r2_scores['manhattan'].max()
best_d1_k = k_vals[r2_scores['manhattan'].argmax()]

# Euclidean
best_d2_r2 = r2_scores['euclidean'].max()
best_d2_k = k_vals[r2_scores['euclidean'].argmax()]

mets = ['uniform', 'manhattan', 'euclidean'] # metrics used
vals = [best_u_r2, best_d1_r2, best_d2_r2] # list of best scores
ks = [best_u_k, best_d1_k, best_d2_k] # list of best associated k values

best_r2_val_full = max(vals)
best_r2_m = mets[np.argmax(vals)]
best_r2_k = ks[np.argmax(vals)]

print("Best kNN computed using " + str(best_r2_m) + " metric with k=" + str(best_r2_k) )
print("kNN R^2 score = " + str(best_r2_val_full))
Best kNN computed using euclidean metric with k=13
kNN R^2 score = 0.7039428277962443

As expected, including the set of financial features does increase regression performance, but only very slightly. In fact, the regression trained on the LASSO-reduced non-financial feature set scores almost identically to the one trained on the full feature set. This suggests that the tuition cost is largely dominated by the size of the student body, as seen in the large weight magnitudes for the numbers of accepted and enrolled students.

While it was suspected that including more information about a university's finances, such as the number of donors, the average cost of textbooks, or room and board costs, would allow the regression to predict tuition more easily, the opposite is observed: there is a negligible difference between the coefficient of determination scores for the non-financial feature set and the complete feature set. A larger dataset in general could provide more accurate results and allow the regression to perform better; a significant number of colleges were labelled as outliers, so the total dataset used in this analysis was relatively small. Another way to improve the regression would be to include features more closely related to tuition itself rather than finances in general, such as the breakdown of in-state vs. out-of-state students, the number of faculty employed, or the average faculty salary.

The coefficient of determination scores identified in this project are summarized in the table below for easy comparison.

In [43]:
data = {"model" : ["Linear", "Linear", "LASSO", "Linear", "LASSO", "Ridge", "Ridge", "kNN", "kNN", "kNN"], 
        "set" : ["Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Non-financial", "Financial", "Full"], 
        "features" : ["Full", "Uncorrelated", "Full", "Reduced", "Reduced", "Reduced", "Full", "Reduced", "Full", "Full"],
        "R^2": [lin_score_nf, lin_score_unc, lass_score_1, lin_score_red, lass_score_red, ridge_r2_1, ridge_r2_2, best_r2_val, best_r2_val_f, best_r2_val_full]}

r2_scores = pd.DataFrame(data=data) # make dataframe
r2_scores.sort_values(by="R^2", ascending=False, inplace=True) # sort by score
r2_scores.reset_index(drop=True, inplace = True) # reset numerical indices

r2_scores # display data
Out[43]:
model set features R^2
0 kNN Full Full 0.703943
1 kNN Non-financial Reduced 0.697219
2 Ridge Non-financial Reduced 0.681572
3 LASSO Non-financial Reduced 0.681053
4 Linear Non-financial Reduced 0.681041
5 Ridge Non-financial Full 0.680546
6 Linear Non-financial Full 0.680006
7 LASSO Non-financial Full 0.679995
8 Linear Non-financial Uncorrelated 0.632418
9 kNN Financial Full 0.436938

References

[1] Gupta, Y. (2019, October 28). US College Data. Kaggle. Retrieved May 2, 2023, from https://www.kaggle.com/datasets/yashgpt/us-college-data