import numpy as np
from numpy.random import default_rng
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix, f1_score, balanced_accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import precision_score
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.model_selection import GridSearchCV


weather_raw = pd.read_csv('https://raw.githubusercontent.com/zonination/weather-us/master/nyc.csv')
weather_raw


accidents_raw = pd.read_csv('https://query.data.world/s/wyhfcc4evd3fydwgw3tdp2synck42u?dws=00000')
accidents_raw

C:\Users\andre\AppData\Local\Temp\ipykernel_21300\1668878680.py:1: DtypeWarning: Columns (3) have mixed types. Specify dtype option on import or set low_memory=False.
  accidents_raw = pd.read_csv('https://query.data.world/s/wyhfcc4evd3fydwgw3tdp2synck42u?dws=00000')


print("Weather Data")
weather_dates = pd.to_datetime(weather_raw["Date"]) # convert all listed dates as datetime values
print("Start Date: " + str(weather_dates.min()) )
print("End Date: " + str(weather_dates.max()) )

print('')

print("Collision Data")
accident_dates = pd.to_datetime(accidents_raw["CRASH DATE"], errors = "coerce") # force non-date entries to be NaT
print("Start Date: " + str(accident_dates.min()) )
print("End Date: " + str(accident_dates.max()) )

Weather Data
Start Date: 1948-07-01 00:00:00
End Date: 2015-12-31 00:00:00

Collision Data
Start Date: 2012-07-01 00:00:00
End Date: 2020-01-11 00:00:00


is_past_start = (weather_dates >= '2012-10-12') # boolean expressions to check if within date range
is_before_end = (weather_dates <= '2015-12-31')

weather2 = weather_raw[is_past_start & is_before_end]

weather2.head(5)


wx_cols = pd.Series(weather_raw.columns)
used_features = [1,  3,9,12,18,20,21,22] # indices of features/columns used
wx_features = wx_cols[used_features]

print("All available weather features for analysis:")
print(wx_cols[2:-2]) 

print("")
print("Simplified weather data used:")
print(wx_features[1:])

All available weather features for analysis:
2              Max.TemperatureF
3             Mean.TemperatureF
4              Min.TemperatureF
5                Max.Dew.PointF
6                MeanDew.PointF
7                 Min.DewpointF
8                  Max.Humidity
9                 Mean.Humidity
10                 Min.Humidity
11     Max.Sea.Level.PressureIn
12    Mean.Sea.Level.PressureIn
13     Min.Sea.Level.PressureIn
14          Max.VisibilityMiles
15         Mean.VisibilityMiles
16          Min.VisibilityMiles
17            Max.Wind.SpeedMPH
18           Mean.Wind.SpeedMPH
19            Max.Gust.SpeedMPH
20              PrecipitationIn
21                   CloudCover
22                       Events
23         WindDirDegrees.br...
dtype: object

Simplified weather data used:
3             Mean.TemperatureF
9                 Mean.Humidity
12    Mean.Sea.Level.PressureIn
18           Mean.Wind.SpeedMPH
20              PrecipitationIn
21                   CloudCover
22                       Events
dtype: object


X2 = weather2[wx_features]

pd.to_datetime(X2["Date"]) # convert date to proper format if not already
X3 = X2.set_index("Date")

X3.head()


print(X3["Events"].unique()) # all possible weather combinations
event_type = ["Rain", "Snow", "Fog", "Thunderstorm"] # unique weather events

[nan 'Rain' 'Fog-Rain' 'Fog' 'Fog-Rain-Snow' 'Snow' 'Rain-Thunderstorm'
 'Rain-Snow' 'Fog-Snow' 'Fog-Rain-Thunderstorm' 'Thunderstorm'
 'Fog-Thunderstorm' 'Rain-Snow-Thunderstorm']


# boolean search if weather type is present that day
rain = []
snow = []
fog = []
tstorm = []
for i in range(len(X3["Events"])): #loop over each day
    if 'Rain' in str(X3["Events"][i]): # rain present
        rain.append(1)
    else:
        rain.append(0)
    if 'Snow' in str(X3["Events"][i]): # snow present
        snow.append(1)
    else:
        snow.append(0)
    if "Fog" in str(X3["Events"][i]): # fog present
        fog.append(1)
    else:
        fog.append(0)
    if "Thunderstorm" in str(X3["Events"][i]): # thunderstorm present
        tstorm.append(1)
    else:
        tstorm.append(0)

# add dummy features to dataframe
X3["Is_Rain"] = rain
X3["Is_Snow"] = snow
X3["Is_Fog"] = fog
X3["Is_Storm"] = tstorm
X4 = X3.drop(columns="Events") # event feature is obsolete now

X4 = X4.rename(columns={"Mean.TemperatureF" : "AvgTempF", "Mean.Humidity" : "AvgHumidity", "Mean.Sea.Level.PressureIn" : "AvgPressureIn", "Mean.Wind.SpeedMPH" : "AvgWindSpdMPH", "PrecipitationIn" : "PrecipIn"}) # rename columns to shorten text
X4.head()


X4["PrecipIn"] = X4["PrecipIn"].replace("T", 0) # replace trace amounts of precipitation
X4 = X4.astype("float") # convert stray values to floating point values
X = X4

X


daily_count = pd.Series(accident_dates.value_counts()) # count occurences of each date & collision entry pair

daily_count = daily_count.to_frame()
daily_count2 = daily_count.reset_index(inplace=False) # have date as seperate column
daily_count2.columns = ["Date", "No. of Collisions"]

daily_count3 = daily_count2.sort_values(by = "Date") # sort dataframe by date

daily_count3


accident_dates_2 = daily_count3["Date"] # extract dates from list before

is_past_start = (accident_dates_2 >= '2012-10-12') # boolean expressions to check if within date range
is_before_end = (accident_dates_2 <= '2015-12-31')

collisions2 = daily_count3[is_past_start & is_before_end]

collisions3 = collisions2.set_index("Date") # convert date column back to index

collisions3


Xy = X # copy feature frame
Xy["Collisions"] = collisions3 # append collisions column 
Xy


X = Xy[["AvgTempF", "AvgHumidity", "AvgPressureIn", "AvgWindSpdMPH", "PrecipIn", "CloudCover", "Is_Rain", "Is_Snow","Is_Fog","Is_Storm"]]
X


y = Xy[["Collisions"]]
y


X.describe().iloc[[1,2,3,7,4,5,6]]


X_monthly = X.reset_index(inplace=False) # copy dataframe

X_monthly["Date"] = pd.to_datetime(X_monthly["Date"]) # convert date to proper format
X_monthly["Month"] = pd.DatetimeIndex(X_monthly['Date']).month # add month column 

# show summary statistics for each month, transpose to fit on screen
i = 0 # temperature data ranges from 0 to 8
X_monthly.groupby("Month").describe().transpose().iloc[[1+i*8,2+i*8,3+i*8,7+i*8,4+i*8,5+i*8,6+i*8]]


i = 4
X_monthly.groupby("Month").describe().transpose().iloc[[1+i*8,2+i*8,3+i*8,7+i*8,4+i*8,5+i*8,6+i*8]] # precipitation


cols = X.columns
for i in range(0,6): # range over each X column (exclude dummy variables)
    sns.relplot(data=X_monthly, x="Month", y=cols[i], kind="line");
#sns.relplot(data=X_monthly, x="Month", y="AvgTempF", kind = "line");


sns.catplot(data=X_monthly,
    x="Month", y="AvgTempF",
    kind="box"
    );


# use daily_count3 to get a longer range view
daily_count3['Month'] = pd.DatetimeIndex(daily_count3['Date']).month # add a month column

sns.relplot(data=daily_count3, 
    x="Month", y="No. of Collisions",
    kind = "line",
    );


print("Weather Features: ")
print(*np.array(X.columns), sep = ", ")

Weather Features: 
AvgTempF, AvgHumidity, AvgPressureIn, AvgWindSpdMPH, PrecipIn, CloudCover, Is_Rain, Is_Snow, Is_Fog, Is_Storm


cols = X.columns[0:6]

sns.pairplot(data=X[cols], height=2);


'''
events = ['Is_Rain', 'Is_Snow', 'Is_Fog', 'Is_Storm']
other_weather = ['AvgTempF', 'AvgHumidity', 'AvgPressureIn', 'AvgWindSpdMPH', 'CloudCover']

for i in range(len(events)): # for each weather event
    for j in range(len(other_weather)): # for each weather feature
        sns.relplot(data=X, x=events[i], y=other_weather[j]);
'''

"\nevents = ['Is_Rain', 'Is_Snow', 'Is_Fog', 'Is_Storm']\nother_weather = ['AvgTempF', 'AvgHumidity', 'AvgPressureIn', 'AvgWindSpdMPH', 'CloudCover']\n\nfor i in range(len(events)): # for each weather event\n    for j in range(len(other_weather)): # for each weather feature\n        sns.relplot(data=X, x=events[i], y=other_weather[j]);\n"


event_count = weather2["Events"].value_counts()
print(event_count)
plt.scatter(event_count, event_count.index);

Rain                      275
Snow                       58
Fog-Rain                   52
Rain-Thunderstorm          52
Rain-Snow                  32
Fog                        18
Fog-Rain-Snow              13
Fog-Snow                    5
Fog-Rain-Thunderstorm       5
Thunderstorm                1
Fog-Thunderstorm            1
Rain-Snow-Thunderstorm      1
Name: Events, dtype: int64


full_spearman = X.corr(method = 'spearman') # compute spearman correlation

for i in range(len(X.columns)):
    for j in range(len(X.columns)):
        if i==j:
            full_spearman.iloc[i,j] = None # remove all values on diagonal
            
full_spearman


# track current maximum value and position
max_val = 0
max_val_pos = [0,0]
# loop over all values in table to find maximum
for i in range(len(X.columns)):
    for j in range(len(X.columns)):
        if abs(full_spearman.iloc[i,j]) > max_val:
            max_val = full_spearman.iloc[i,j] # save new max
            max_val_pos = [i,j]
            
print('Most correlated features: ' + str(X.columns[max_val_pos[0]]) + ' and ' + str(X.columns[max_val_pos[1]]) )
print('Corresponding spearman coefficient: ' + str(max_val))

Most correlated features: PrecipIn and Is_Rain
Corresponding spearman coefficient: 0.7617920864237233


print("Pressure vs Temp for Snow Occurence")
sns.relplot(data=X, x="AvgTempF", y="AvgPressureIn", hue="Is_Snow"); # plot pressure vs precipitation with hues as storm

snow_occ = X[X["Is_Snow"] == 1] # data which snow occurs
snow_no_occ = X[X["Is_Snow"] == 0] # data which no snow occurs
print("spearman coeff. w/ snow: " + str(snow_occ["AvgTempF"].corr(snow_occ["AvgPressureIn"],"spearman"))) 
print("spearman coeff. w/o snow: " + str(snow_no_occ["AvgTempF"].corr(snow_no_occ["AvgPressureIn"],"spearman"))) 
plt.show()
print('')


print("Wind Speed vs Pressure for Storm Occurence")
sns.relplot(data=X, x="AvgPressureIn", y="AvgWindSpdMPH", hue="Is_Storm"); # plot pressure vs wind speed with hues as storm

storm_occ = X[X["Is_Storm"] == 1] # data which storm occurs
storm_no_occ = X[X["Is_Storm"] == 0] # data which no storm occurs
print("spearman coeff. w/ storm: " + str(storm_occ["AvgPressureIn"].corr(storm_occ["AvgWindSpdMPH"],"spearman"))) 
print("spearman coeff. w/o storm: " + str(storm_no_occ["AvgPressureIn"].corr(storm_no_occ["AvgWindSpdMPH"],"spearman")))
plt.show()
print('')


print("Precipitation vs Pressure for Storm Occurence")
sns.relplot(data=X, x="AvgPressureIn", y="PrecipIn", hue="Is_Storm"); # plot pressure vs precipitation with hues as storm

print("spearman coeff. w/ storm: " + str(storm_occ["AvgPressureIn"].corr(storm_occ["PrecipIn"],"spearman"))) 
print("spearman coeff. w/o storm: " + str(storm_no_occ["AvgPressureIn"].corr(storm_no_occ["PrecipIn"],"spearman")))
plt.show()
print('')
      

print("Humidity vs Cloud Coverage for Rain Occurence")
sns.relplot(data=X, x="CloudCover", y="AvgHumidity", hue="Is_Rain"); # plot cloud coverage vs humidity with hues as rain

rain_occ = X[X["Is_Rain"] == 1] # data which rain occurs
rain_no_occ = X[X["Is_Rain"] == 0] # data which no rain occurs
print("spearman coeff. w/ rain: " + str(rain_occ["CloudCover"].corr(storm_occ["AvgHumidity"],"spearman"))) 
print("spearman coeff. w/o rain: " + str(rain_no_occ["CloudCover"].corr(storm_no_occ["AvgHumidity"],"spearman")))
plt.show()

Pressure vs Temp for Snow Occurence
spearman coeff. w/ snow: -0.17878996089695212
spearman coeff. w/o snow: -0.3144606791051107

Wind Speed vs Pressure for Storm Occurence
spearman coeff. w/ storm: -0.3725827041242697
spearman coeff. w/o storm: -0.2932232506528725

Precipitation vs Pressure for Storm Occurence
spearman coeff. w/ storm: -0.13826739455694717
spearman coeff. w/o storm: -0.23667109437839273

Humidity vs Cloud Coverage for Rain Occurence
spearman coeff. w/ rain: 0.6470294494629876
spearman coeff. w/o rain: 0.35438050802803356


X_var = pd.Series(X.columns, name = "feature") # all weather features

pearsons = []
spearmans = []
for i in range(len(X_var)): # loop over each feature
    print("No. of collisions vs. " + str(X_var[i]) )
    
    # print the coefficients
    print("Pearson Coeff.: " + str(X[X_var[i]].corr(Xy['Collisions'], "pearson")))
    print("Spearman Coeff.: " + str(X[X_var[i]].corr(Xy['Collisions'], "spearman")))
    
    # add to list of coefficients
    pearsons.append(X[X_var[i]].corr(Xy['Collisions'], "pearson"))
    spearmans.append(X[X_var[i]].corr(Xy['Collisions'], "spearman"))
    
    # display plot
    if i < len(X_var) - 4: # non-categorical data
        sns.relplot(data=Xy, x=X_var[i], y='Collisions')
        plt.show() 

    print('')

No. of collisions vs. AvgTempF
Pearson Coeff.: 0.20467144652139668
Spearman Coeff.: 0.23901028163707264

No. of collisions vs. AvgHumidity
Pearson Coeff.: 0.08215998681606174
Spearman Coeff.: 0.08004977307082761

No. of collisions vs. AvgPressureIn
Pearson Coeff.: -0.028578100529787716
Spearman Coeff.: -0.04203623218258776

No. of collisions vs. AvgWindSpdMPH
Pearson Coeff.: -0.03889247226056687
Spearman Coeff.: -0.030474391389913476

No. of collisions vs. PrecipIn
Pearson Coeff.: 0.1732000911740921
Spearman Coeff.: 0.0940116119954689

No. of collisions vs. CloudCover
Pearson Coeff.: -0.0623599947930078
Spearman Coeff.: -0.07306310534018964

No. of collisions vs. Is_Rain
Pearson Coeff.: 0.07321402775491874
Spearman Coeff.: 0.06373261383027515

No. of collisions vs. Is_Snow
Pearson Coeff.: -0.07387382158194554
Spearman Coeff.: -0.10681662185406175

No. of collisions vs. Is_Fog
Pearson Coeff.: 0.021309140379378786
Spearman Coeff.: 0.0017503268421786807

No. of collisions vs. Is_Storm
Pearson Coeff.: 0.09826594937604892
Spearman Coeff.: 0.1060325779368204


# convert lists to series
pearsons_s = pd.Series(pearsons, name = "pearsons_s")
spearmans_s = pd.Series(spearmans, name = "spearmans_s") 

coeff = pd.concat([X_var, pearsons_s, spearmans_s], axis=1) # convert to dataframe

coeffs = coeff.rename(columns={"pearsons_s" : "pearson", "spearmans_s" : "spearman"}) # rename columns
coeffs = coeffs.set_index("feature") # index by weather feature

coeffs.head(10)


sns.catplot(data=Xy, 
    y="Collisions", 
    kind="box"
);

sns.displot(data=Xy, 
    x="Collisions"
    );

print('Daily Collision Data')
y_desc = pd.Series(Xy["Collisions"].describe())
print(y_desc.head(10))

num_out = 0 # number of outliers
p25 = float(y_desc[4]) # 25th percentile
p75 = float(y_desc[6]) # 75th percentile

for i in range(len(y)): # loop over each label ent
    val = y.iloc[i][0]
    if val > (p75 + 1.5*(p75-p25)) or val < (p25 - 1.5*(p75-p25)):
        num_out+=1     

print("")
print("Number of outliers: " + str(num_out))

Daily Collision Data
count    1176.000000
mean      570.761054
std        95.460790
min       188.000000
25%       510.750000
50%       571.000000
75%       629.250000
max      1161.000000
Name: Collisions, dtype: float64

Number of outliers: 17


IQR = p75 - p25
min_outlier = p25 - 1.5*IQR # compute outlier bounds
max_outlier = p75 + 1.5*IQR 

# boolean statements for filtering, True if in allowable 1.5IQR range
is_not_outlier_min = (Xy["Collisions"] > min_outlier)
is_not_outlier_max = (Xy["Collisions"] < max_outlier)

Xy2 = Xy # copy data

# apply filters
Xy2 = Xy[is_not_outlier_min & is_not_outlier_max]

Xy = Xy2 # update dataframes

X = Xy[["AvgTempF", "AvgHumidity", "AvgPressureIn", "AvgWindSpdMPH", "PrecipIn", "CloudCover", "Is_Rain", "Is_Snow","Is_Fog","Is_Storm"]]
X.head(5)


y = Xy[["Collisions"]]
y.head(5)


label_types = pd.DataFrame(data = 
    {'Symbol' : ['N', 'n', 'M', 'p', 'P'], 
     "Name" : ['Very negative', 'Moderately negative', 'Median', 'Moderately positive', 'Very positive'], 
     "Percentile Range" : [' 0 <= p <  20 ', '20 <= p <  40 ', '40 <= p <= 60 ', '60  < p <= 80 ', '80  < p <= 100'] } 
    )

label_types.head()


labels = pd.Series(y["Collisions"]) # convert to series

# compute percentiles for divisions
p20 = float(y.quantile(q=0.2)) 
p40 = float(y.quantile(q=0.4)) 
p50 = float(y.quantile(q=0.5)) 
p60 = float(y.quantile(q=0.6)) 
p80 = float(y.quantile(q=0.8)) 

print("Percentiles: " + "20th = " + str(p20) + ", 40th = " + str(p40) + ", 60th = " + str(p60) + ", 80th = " + str(p80))

yc_list = [] # label list
for i in range(len(y)):
    val = y.iloc[i][0]
    if val < p20:
        yc_list.append('N')
    if val >= p20 and val < p40:
        yc_list.append('n')
    if val >= p40 and val <= p60:
        yc_list.append('M')
    if val > p60 and val <= p80:
        yc_list.append('p')
    if val > p80:
        yc_list.append('P')
        
yc = pd.Series(data = yc_list, name = "yc", index = y.index) # convert to series

# display labels to make sure they make sense
yc_frame = pd.DataFrame(data={"Collisions" : y["Collisions"], "Label" : yc}, index = Xy.index)
yc_frame

Percentiles: 20th = 496.0, 40th = 549.0, 60th = 595.0, 80th = 645.0


X_train, X_test, y_train, y_test = train_test_split(
    X, yc,
    test_size=0.2,
    shuffle=True, 
    random_state=43913,
); # split data (features and categorical labels) between testing and training data

# if KNN
knn = KNeighborsClassifier(n_neighbors=3, weights = "distance", p = 2); # use kNN algorithm, euclidean distance
pipe = make_pipeline(StandardScaler(), knn); # standardize data

# if tree
#tree = DecisionTreeClassifier(max_depth=1) # code if using 
#pipe = make_pipeline(StandardScaler(), tree); # standardize data

pipe.fit(X_train, y_train); # train pipeline 

yhat = knn.predict(X_test); # make predictions using trained data

pd.Series(yhat, index = y_test.index).head(10) # display predicted data

C:\Users\andre\anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but KNeighborsClassifier was fitted without feature names
  warnings.warn(

Date
2013-02-24    P
2014-08-26    M
2015-06-03    M
2013-11-12    P
2014-02-11    n
2015-05-06    M
2014-06-26    M
2014-03-26    P
2014-01-11    P
2014-11-06    M
dtype: object


label_symbols = ['N', 'n', 'M', 'p', 'P'] # all possible labels

# compute and display confusion matrix
C = confusion_matrix(y_test, yhat, labels=label_symbols)
ConfusionMatrixDisplay(C, display_labels=label_symbols).plot();

pr_all = precision_score(y_test, yhat, average="macro", zero_division = 0)
print('Average precision (single): ' + str(pr_all) )

Average precision (single): 0.14396147944762983


p_hat = knn.predict_proba(X_test) # probabilistic interpretation

# calculate true and false positive rates for a variety of theta values
results = []
for i, label in enumerate(knn.classes_):
    actual = (y_test==label)
    fp, tp, theta = roc_curve(actual,p_hat[:,i])
    results.extend( [(label,fp,tp) for fp,tp in zip(fp,tp)] )
roc = pd.DataFrame( results, columns=["label","FP rate","TP rate"] )

sns.relplot(data=roc, 
    x="FP rate", y="TP rate", 
    hue="label", kind="line", estimator=None
    );

# compute auc score 
auc = roc_auc_score(y_test, p_hat, average = 'macro', multi_class = 'ovr')
print(f"AUC score (single classifier) = {auc:.6f}")

C:\Users\andre\anaconda3\lib\site-packages\sklearn\base.py:443: UserWarning: X has feature names, but KNeighborsClassifier was fitted without feature names
  warnings.warn(

AUC score (single classifier) = 0.498915


ensemble = BaggingClassifier( 
    pipe, 
    max_samples=0.8,
    n_estimators=50,
    random_state=4912
    ) # create ensemble 

ensemble.fit(X_train, y_train); # train ensemble

yhat_e = ensemble.predict(X_test); # ensemble prediction

C = confusion_matrix(y_test, yhat_e, labels=label_symbols)
ConfusionMatrixDisplay(C, display_labels=label_symbols).plot();

pr_all_e = precision_score(y_test, yhat_e, average="macro", zero_division = 0)
print('Average precision (ensemble): ' + str(pr_all_e) )

Average precision (ensemble): 0.2066064887493459


phat_e = ensemble.predict_proba(X_test); # ensemble probabilistic prediction

# calculate true and false positive rates for a variety of theta values
results = []
for i, label in enumerate(ensemble.classes_):
    actual = (y_test==label)
    fp, tp, theta = roc_curve(actual,phat_e[:,i])
    results.extend( [(label,fp,tp) for fp,tp in zip(fp,tp)] )
roc = pd.DataFrame( results, columns=["label","FP rate","TP rate"] )

sns.relplot(data=roc, 
    x="FP rate", y="TP rate", 
    hue="label", kind="line", estimator=None
    );


auc = roc_auc_score(y_test, phat_e, average = 'macro', multi_class = 'ovr')
print(f"AUC score (ensemble classifier) = {auc:.6f}")

AUC score (ensemble classifier) = 0.511558


kf = StratifiedKFold(n_splits=6, shuffle=True, random_state=15321)
scores = cross_validate(
    pipe, 
    X_train, y_train, 
    cv=kf,
    scoring="balanced_accuracy"
    )

print("Validation scores:")
print( scores["test_score"] )

print('')
print("Validation score st. dev: " + str(np.std(scores["test_score"]))) # display standard deviation

Validation scores:
[0.23125695 0.22614016 0.21190211 0.19554876 0.24601641 0.22170467]

Validation score st. dev: 0.01571246126302048


neighbors = range(1, 100, 1) # check all neighbors from n=1 to n=12

results = []
for n in neighbors:
    knn = KNeighborsClassifier(n_neighbors=n, weights = "distance", p = 2)
    cv = cross_validate(knn, 
        X_train, y_train, 
        cv=kf, 
        scoring="balanced_accuracy",
        n_jobs=-1
        )
    for err in (1 - cv["test_score"]):
      results.append( (n, err) )

results = pd.DataFrame(results, columns=["neighbors", "error"] )
sns.relplot(data=results, 
    x="neighbors", y="error", 
    kind="line"
    );


errors = results.set_index('neighbors') # use neighbors as index

error_agg = errors.groupby("neighbors")["error"].describe()['mean'] # group data by neighbors, find mean of aggregate

print(error_agg.head())

print('')
index_best_n = error_agg.argmin()
print('Optimal n: ' + str(neighbors[index_best_n]) )
print('Associated error: ' + str(error_agg.iloc[index_best_n]) )

neighbors
1    0.783605
2    0.783605
3    0.783928
4    0.781273
5    0.782426
Name: mean, dtype: float64

Optimal n: 65
Associated error: 0.7325304350512916


knn = KNeighborsClassifier(n_neighbors=65, weights = "distance", p = 2); # use kNN algorithm, euclidean distance
pipe = make_pipeline(StandardScaler(), knn); # standardize data

ensemble = BaggingClassifier( 
    pipe, 
    max_samples=0.8,
    n_estimators=50,
    random_state=4912
    ) # create ensemble 

ensemble.fit(X_train, y_train); # train ensemble

yhat_e = ensemble.predict(X_test); # ensemble prediction

C = confusion_matrix(y_test, yhat_e, labels=label_symbols)
ConfusionMatrixDisplay(C, display_labels=label_symbols).plot();

pr_all_e = precision_score(y_test, yhat_e, average="macro", zero_division = 0)
print('Average precision (ensemble): ' + str(pr_all_e) )

Average precision (ensemble): 0.2505324756544268


p_hat_e = ensemble.predict_proba(X_test) # probabilistic characterization, ensemble

auc = roc_auc_score(y_test, p_hat_e, average = 'macro', multi_class = 'ovr')
print(f"AUC score (ensemble classifier) = {auc:.6f}")

AUC score (ensemble classifier) = 0.564489


# total grid search space
grid = { "n_neighbors":range(1,150,1), 
         "weights":['uniform', 'distance'], 
         "p": [1,2]}

knn = KNeighborsClassifier(); # kNN algorithm based on grid search

kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=302)

grid_dt = GridSearchCV(
    knn, grid, 
    scoring="balanced_accuracy", 
    cv=kf,
    n_jobs=-1
    )
grid_dt.fit(X_train, y_train)

print("Best parameters:")
print(grid_dt.best_params_)
print()
print("Best score:")
print(grid_dt.best_score_)

Best parameters:
{'n_neighbors': 111, 'p': 1, 'weights': 'uniform'}

Best score:
0.28887508727530886


# Confusion Matrix, 
knn = KNeighborsClassifier(n_neighbors=111, weights = "uniform", p = 1); # use kNN algorithm based on optimal hyperparameters
pipe = make_pipeline(StandardScaler(), knn); # standardize data

ensemble = BaggingClassifier( 
    pipe, 
    max_samples=0.8,
    n_estimators=50,
    random_state=4912
    ) # create ensemble 

ensemble.fit(X_train, y_train); # train ensemble

yhat_e = ensemble.predict(X_test); # ensemble prediction

C = confusion_matrix(y_test, yhat_e, labels=label_symbols)
ConfusionMatrixDisplay(C, display_labels=label_symbols).plot();

pr_all_e = precision_score(y_test, yhat_e, average="macro", zero_division = 0)
print('Ideal average precision (ensemble): ' + str(pr_all_e) )

# AUC score
p_hat_e = ensemble.predict_proba(X_test) # probabilistic characterization, ensemble

auc = roc_auc_score(y_test, p_hat_e, average = 'macro', multi_class = 'ovr')
print(f"Ideal AUC score (ensemble) = {auc:.6f}")

Ideal average precision (ensemble): 0.19542002380148277
Ideal AUC score (ensemble) = 0.572560

	CRASH DATE	CRASH TIME	BOROUGH	ZIP CODE	LATITUDE	LONGITUDE	LOCATION	ON STREET NAME	CROSS STREET NAME	OFF STREET NAME	...	CONTRIBUTING FACTOR VEHICLE 2	CONTRIBUTING FACTOR VEHICLE 3	CONTRIBUTING FACTOR VEHICLE 4	CONTRIBUTING FACTOR VEHICLE 5	COLLISION_ID	VEHICLE TYPE CODE 1	VEHICLE TYPE CODE 2	VEHICLE TYPE CODE 3	VEHICLE TYPE CODE 4	VEHICLE TYPE CODE 5
0	07/22/2019	19:00	BROOKLYN	11208.0	40.679080	-73.877210	POINT (-73.87721 40.67908)	NaN	NaN	289 LOGAN STREET	...	NaN	NaN	NaN	NaN	4175832	Sedan	NaN	NaN	NaN	NaN
1	08/11/2019	15:35	BROOKLYN	11234.0	40.606000	-73.928450	POINT (-73.92845 40.606)	AVENUE U	EAST 35 STREET	NaN	...	Driver Inattention/Distraction	NaN	NaN	NaN	4190884	Sedan	Sedan	NaN	NaN	NaN
2	07/31/2019	15:10	BROOKLYN	11205.0	40.699192	-73.955190	POINT (-73.95519 40.699192)	FLUSHING AVENUE	WALWORTH STREET	NaN	...	Unspecified	NaN	NaN	NaN	4180546	Box Truck	Station Wagon/Sport Utility Vehicle	NaN	NaN	NaN
3	08/14/2019	17:00	BROOKLYN	11206.0	40.696354	-73.940710	POINT (-73.94071 40.696354)	MARCUS GARVEY BOULEVARD	MYRTLE AVENUE	NaN	...	Unspecified	NaN	NaN	NaN	4189386	Sedan	Sedan	NaN	NaN	NaN
4	07/22/2019	12:25	QUEENS	11374.0	NaN	NaN	NaN	Woodhaven Boulevard	Metropolitan Avenue	NaN	...	Unspecified	NaN	NaN	NaN	4175283	Box Truck	Sedan	NaN	NaN	NaN
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1635885	07/06/2012	23:45	BRONX	10454	40.809107	-73.922887	POINT (-73.9228869 40.8091071)	EAST 138 STREET	WILLIS AVENUE	NaN	...	NaN	NaN	NaN	NaN	72627	PASSENGER VEHICLE	NaN	NaN	NaN	NaN
1635886	07/05/2012	18:11	QUEENS	11361	40.766554	-73.772545	POINT (-73.7725446 40.7665544)	BELL BOULEVARD	38 AVENUE	NaN	...	Unspecified	NaN	NaN	NaN	259386	SPORT UTILITY / STATION WAGON	SPORT UTILITY / STATION WAGON	NaN	NaN	NaN
1635887	07/02/2012	22:56	QUEENS	11385	40.700593	-73.854565	POINT (-73.8545649 40.7005934)	WOODHAVEN BOULEVARD	FOREST PARK DRIVE	NaN	...	Unspecified	NaN	NaN	NaN	203707	UNKNOWN	BICYCLE	NaN	NaN	NaN
1635888	07/06/2012	12:00	BROOKLYN	11204	40.619963	-73.982281	POINT (-73.9822811 40.6199627)	59 STREET	20 AVENUE	NaN	...	Unspecified	NaN	NaN	NaN	129942	SPORT UTILITY / STATION WAGON	UNKNOWN	NaN	NaN	NaN
1635889	07/03/2012	14:50	MANHATTAN	10012	40.725083	-73.995328	POINT (-73.9953279 40.7250834)	EAST HOUSTON STREET	LAFAYETTE STREET	NaN	...	Unspecified	NaN	NaN	NaN	5309	LARGE COM VEH(6 OR MORE TIRES)	PICK-UP TRUCK	NaN	NaN	NaN

	Date	No. of Collisions
1964	2012-07-01	538
1716	2012-07-02	564
653	2012-07-03	664
2601	2012-07-04	432
1438	2012-07-05	591
...	...	...
2704	2020-01-07	381
2655	2020-01-08	410
2411	2020-01-09	479
2587	2020-01-10	438
2728	2020-01-11	350

	No. of Collisions
Date
2012-10-12	594
2012-10-13	547
2012-10-14	517
2012-10-15	607
2012-10-16	602
...	...
2015-12-27	518
2015-12-28	490
2015-12-29	533
2015-12-30	501
2015-12-31	527

	Collisions
Date
2012-10-12	594
2012-10-13	547
2012-10-14	517
2012-10-15	607
2012-10-16	602
...	...
2015-12-27	518
2015-12-28	490
2015-12-29	533
2015-12-30	501
2015-12-31	527

	AvgTempF	AvgHumidity	AvgPressureIn	AvgWindSpdMPH	PrecipIn	CloudCover	Is_Rain	Is_Snow	Is_Fog	Is_Storm
mean	54.597789	63.653912	30.052738	11.244048	0.112355	5.136054	0.365646	0.092687	0.079932	0.051020
std	17.217773	13.884268	0.218680	4.300911	0.344442	2.053379	0.481816	0.290117	0.271303	0.220133
min	12.000000	27.000000	28.980000	2.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
max	89.000000	97.000000	30.760000	36.000000	4.680000	8.000000	1.000000	1.000000	1.000000	1.000000
25%	41.000000	53.000000	29.910000	8.000000	0.000000	4.000000	0.000000	0.000000	0.000000	0.000000
50%	55.000000	64.000000	30.050000	11.000000	0.000000	5.000000	0.000000	0.000000	0.000000	0.000000
75%	70.000000	74.000000	30.200000	14.000000	0.030000	7.000000	1.000000	0.000000	0.000000	0.000000

Effects of Weather on Predicted Motor Vehicle Accident Reports in New York City Using Supervised Machine Learning

Andrew Smith, April 10th, 2023

1 Introduction¶

1.1 Project Scope¶

1.2 Hypothesis¶

1.3 Raw Data Sources¶

2 Date Preparation¶

2.1 Project Scope Range¶

2.2 Weather Feature Frame¶

2.3 Daily Reported Vehicle Collisions Aggregation¶

3 Summary Statistics¶

3.1 Generalized Climate Classification¶

3.2 Annual Data Trends¶

4 Correlations¶

4.1 One-Dimensional Feature Dependency¶

4.2 Two-Dimensional Feature Dependency¶

4.3 Label Correlation¶

5 Classification Algorithm¶

5.1 Data Preparation and Algorithm Selection¶

5.2 Single k-Nearest Neighbors Classification¶

5.3 Ensemble k-Nearest Neighbors Classification¶

6 Performance Evaluation¶

6.1 Hyperparameter Optimization¶

6.2 Grid Search Optimization¶

7 Conclusion and Discussion¶

8 References¶

	Unnamed: 0	Date	Max.TemperatureF	Mean.TemperatureF	Min.TemperatureF	Max.Dew.PointF	MeanDew.PointF	Min.DewpointF	Max.Humidity	Mean.Humidity	...	Min.VisibilityMiles	Max.Wind.SpeedMPH	Mean.Wind.SpeedMPH	Max.Gust.SpeedMPH	PrecipitationIn	CloudCover	Events	WindDirDegrees.br...	city	season
0	1	1948-07-01	84	78.0	72	71	65	58	93	65	...	2.0	16	8	NaN	0.00	0.0	Fog	264<br />	New York City (USA)	Summer
1	2	1948-07-02	82	72.0	63	62	53	49	76	51	...	10.0	16	10	NaN	0.00	0.0	NaN	315<br />	New York City (USA)	Summer
2	3	1948-07-03	78	71.0	64	66	58	53	84	62	...	5.0	14	6	NaN	0.00	0.0	NaN	203<br />	New York City (USA)	Summer
3	4	1948-07-04	84	76.0	68	68	63	56	90	67	...	2.0	12	5	NaN	0.00	0.0	Fog	198<br />	New York City (USA)	Summer
4	5	1948-07-05	93	82.0	70	74	69	65	93	71	...	3.0	18	8	NaN	0.00	0.0	Fog-Rain-Thunderstorm	218<br />	New York City (USA)	Summer
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
24555	24623	2015-12-27	63	56.0	48	58	51	37	96	82	...	0.0	23	12	29.0	0.08	8.0	Fog-Rain	283<br />	New York City (USA)	Winter
24556	24624	2015-12-28	48	42.0	35	35	25	19	70	57	...	6.0	23	14	37.0	0.05	7.0	Rain-Snow	39<br />	New York City (USA)	Winter
24557	24625	2015-12-29	49	42.0	34	46	40	30	93	88	...	1.0	24	12	30.0	0.66	8.0	Rain-Snow	28<br />	New York City (USA)	Winter
24558	24626	2015-12-30	51	46.0	41	48	41	37	93	85	...	2.0	22	7	26.0	0.37	8.0	Rain	65<br />	New York City (USA)	Winter
24559	24627	2015-12-31	52	48.0	43	46	38	27	93	71	...	5.0	20	12	25.0	0.06	7.0	Rain	317<br />	New York City (USA)	Winter

	Unnamed: 0	Date	Max.TemperatureF	Mean.TemperatureF	Min.TemperatureF	Max.Dew.PointF	MeanDew.PointF	Min.DewpointF	Max.Humidity	Mean.Humidity	...	Min.VisibilityMiles	Max.Wind.SpeedMPH	Mean.Wind.SpeedMPH	Max.Gust.SpeedMPH	PrecipitationIn	CloudCover	Events	WindDirDegrees.br...	city	season
23384	23449	2012-10-12	60	52.0	43	44	35	23	80	58	...	10.0	32	15	38.0	T	3.0	NaN	298<br />	New York City (USA)	Autumn
23385	23450	2012-10-13	55	46.0	37	43	27	18	61	44	...	10.0	24	9	29.0	0.00	2.0	NaN	201<br />	New York City (USA)	Autumn
23386	23451	2012-10-14	72	64.0	55	59	52	44	90	69	...	10.0	24	17	29.0	0.00	6.0	NaN	210<br />	New York City (USA)	Autumn
23387	23452	2012-10-15	69	65.0	61	63	60	57	93	83	...	2.0	23	14	28.0	0.18	7.0	Rain	208<br />	New York City (USA)	Autumn
23388	23453	2012-10-16	61	55.0	48	56	38	34	84	62	...	10.0	29	15	36.0	0.00	3.0	NaN	320<br />	New York City (USA)	Autumn

	Month	1	2	3	4	5	6	7	8	9	10	11	12
AvgTempF	mean	31.655914	30.130952	38.612903	51.322222	62.473118	71.377778	78.021505	75.892473	70.055556	59.194690	46.891667	43.169355
	std	9.399924	7.708202	7.095670	6.003422	5.775871	5.529447	4.091291	3.402245	6.372979	6.355339	8.036467	7.926206
	min	13.000000	12.000000	19.000000	39.000000	51.000000	54.000000	67.000000	69.000000	58.000000	44.000000	28.000000	26.000000
	max	49.000000	46.000000	52.000000	65.000000	76.000000	83.000000	89.000000	83.000000	83.000000	73.000000	69.000000	63.000000
	25%	25.000000	25.750000	35.000000	47.000000	59.000000	68.250000	75.000000	73.000000	64.000000	55.000000	41.750000	37.000000
	50%	33.000000	29.000000	37.000000	52.000000	62.000000	72.000000	78.000000	76.000000	70.000000	60.000000	47.000000	43.000000
	75%	39.000000	36.250000	44.000000	55.000000	66.000000	75.000000	80.000000	78.000000	74.750000	63.000000	52.250000	48.250000

	Month	1	2	3	4	5	6	7	8	9	10	11	12
PrecipIn	mean	0.113441	0.11750	0.122688	0.115222	0.086989	0.172778	0.101075	0.113871	0.065889	0.082566	0.081500	0.171452
	std	0.261187	0.28166	0.334333	0.517664	0.249703	0.501237	0.277212	0.427823	0.204131	0.293582	0.252605	0.384580
	min	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	max	1.620000	1.27000	2.600000	4.680000	1.540000	4.010000	1.870000	2.920000	1.180000	1.810000	1.410000	3.040000
	25%	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	50%	0.000000	0.00000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
	75%	0.060000	0.04750	0.040000	0.025000	0.020000	0.087500	0.030000	0.000000	0.000000	0.010000	0.000000	0.135000

	AvgTempF	AvgHumidity	AvgPressureIn	AvgWindSpdMPH	PrecipIn	CloudCover	Is_Rain	Is_Snow	Is_Fog	Is_Storm
AvgTempF	NaN	0.276427	-0.242008	-0.251217	-0.029748	-0.054441	0.107183	-0.411795	-0.069609	0.236140
AvgHumidity	0.276427	NaN	-0.221299	-0.215239	0.583763	0.625396	0.566843	-0.005344	0.343545	0.214482
AvgPressureIn	-0.242008	-0.221299	NaN	-0.284828	-0.284235	-0.211834	-0.302258	-0.088026	-0.112153	-0.189865
AvgWindSpdMPH	-0.251217	-0.215239	-0.284828	NaN	0.069912	-0.045296	0.035806	0.151964	-0.041808	-0.034684
PrecipIn	-0.029748	0.583763	-0.284235	0.069912	NaN	0.621792	0.761792	0.211847	0.281316	0.330087
CloudCover	-0.054441	0.625396	-0.211834	-0.045296	0.621792	NaN	0.615265	0.213627	0.245711	0.155206
Is_Rain	0.107183	0.566843	-0.302258	0.035806	0.761792	0.615265	NaN	0.037411	0.231971	0.289359
Is_Snow	-0.411795	-0.005344	-0.088026	0.151964	0.211847	0.213627	0.037411	NaN	0.100422	-0.060783
Is_Fog	-0.069609	0.343545	-0.112153	-0.041808	0.281316	0.245711	0.231971	0.100422	NaN	0.017158
Is_Storm	0.236140	0.214482	-0.189865	-0.034684	0.330087	0.155206	0.289359	-0.060783	0.017158	NaN

	pearson	spearman
feature
AvgTempF	0.204671	0.239010
AvgHumidity	0.082160	0.080050
AvgPressureIn	-0.028578	-0.042036
AvgWindSpdMPH	-0.038892	-0.030474
PrecipIn	0.173200	0.094012
CloudCover	-0.062360	-0.073063
Is_Rain	0.073214	0.063733
Is_Snow	-0.073874	-0.106817
Is_Fog	0.021309	0.001750
Is_Storm	0.098266	0.106033

	Symbol	Name	Percentile Range
0	N	Very negative	0 <= p < 20
1	n	Moderately negative	20 <= p < 40
2	M	Median	40 <= p <= 60
3	p	Moderately positive	60 < p <= 80
4	P	Very positive	80 < p <= 100