It is often stated that urban and rural regions of a country become polarized over time, with increasing disparities not only in political affiliations [1] but also in economic and demographic statistics. For the United States, a very populous nation with the fourth largest land area of any country [2], this effect may be exaggerated given the large distances between densely populated regions separated by expansive suburban and rural areas. However, while the political divide between rural and urban regions of the US has been extensively studied, namely in presidential and gubernatorial election results, there is less research on the validity of dividing the US into two distinct groups [1]. Evidence is therefore needed to determine the effectiveness of various county clusterings of the US based on these factors.
This study performs a clustering analysis of counties in the United States to determine whether the binary view of the US is valid and whether it conforms to the expected rural-urban polarization. Data is collected from the US Census Bureau from 2010 to 2019 and includes only economic and demographic statistics for each county; political variables are excluded from the analysis. Clustering is performed in two batches: unsupervised clustering, using k-means, agglomerative, and expectation-maximization clustering, which are blind to inter-county adjacencies; and network clustering, using spectral clustering to give preference to nearby counties. Performance in each computation is measured using an average silhouette score across all clusters. Results indicate that the best clustering performance comes from considering the continental United States as a single cluster, providing evidence that a non-binary view of the United States is valid. Apart from political views, there is little evidence that the United States differs significantly between rural and urban regions, or along any other subgroup dividing the country.
Data on county statistics is collected from the United States Census Bureau from 2010 to 2019 [3]. A list of each US county and all counties sharing a land border with it is given by the county adjacency data, also from the United States Census Bureau [4]. The raw data files used for analysis are shown below.
import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
from matplotlib import pyplot as plt
from matplotlib import image as mpimg
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import silhouette_samples
from sklearn.mixture import GaussianMixture
from sklearn.cluster import SpectralClustering
from warnings import simplefilter
The economic and demographic data for each US county is shown below.
counties = pd.read_csv('https://corgis-edu.github.io/corgis/datasets/csv/county_demographics/county_demographics.csv', index_col=0)
counties.head(5)
| State | Age.Percent 65 and Older | Age.Percent Under 18 Years | Age.Percent Under 5 Years | Education.Bachelor's Degree or Higher | Education.High School or Higher | Employment.Nonemployer Establishments | Ethnicities.American Indian and Alaska Native Alone | Ethnicities.Asian Alone | Ethnicities.Black Alone | ... | Population.Population per Square Mile | Sales.Accommodation and Food Services Sales | Sales.Retail Sales | Employment.Firms.Total | Employment.Firms.Women-Owned | Employment.Firms.Men-Owned | Employment.Firms.Minority-Owned | Employment.Firms.Nonminority-Owned | Employment.Firms.Veteran-Owned | Employment.Firms.Nonveteran-Owned | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| County | |||||||||||||||||||||
| Abbeville County | SC | 22.4 | 19.8 | 4.7 | 15.6 | 81.7 | 1416 | 0.3 | 0.4 | 27.6 | ... | 51.8 | 12507 | 91371 | 1450 | 543 | 689 | 317 | 1080 | 187 | 1211 |
| Acadia Parish | LA | 15.8 | 25.8 | 6.9 | 13.3 | 79.0 | 4533 | 0.4 | 0.3 | 18.0 | ... | 94.3 | 52706 | 602739 | 4664 | 1516 | 2629 | 705 | 3734 | 388 | 4007 |
| Accomack County | VA | 24.6 | 20.7 | 5.6 | 19.5 | 81.5 | 2387 | 0.7 | 0.8 | 28.8 | ... | 73.8 | 53568 | 348195 | 2997 | 802 | 1716 | 335 | 2560 | 212 | 2536 |
| Ada County | ID | 14.9 | 23.2 | 5.6 | 38.5 | 95.2 | 41464 | 0.8 | 2.7 | 1.4 | ... | 372.8 | 763099 | 5766679 | 41789 | 14661 | 19409 | 3099 | 36701 | 3803 | 35132 |
| Adair County | IA | 23.0 | 21.8 | 5.6 | 18.5 | 94.2 | 609 | 0.3 | 0.5 | 0.6 | ... | 13.5 | -1 | 63002 | 914 | 304 | 499 | 0 | 861 | 185 | 679 |
5 rows × 42 columns
The data is indexed by county name with a myriad of statistical information collected. Refer to the source of the data for an explanation of each column header meaning [3]. A complete list of such features is shown below.
cols = pd.Series(counties.columns)
cols
0 State 1 Age.Percent 65 and Older 2 Age.Percent Under 18 Years 3 Age.Percent Under 5 Years 4 Education.Bachelor's Degree or Higher 5 Education.High School or Higher 6 Employment.Nonemployer Establishments 7 Ethnicities.American Indian and Alaska Native ... 8 Ethnicities.Asian Alone 9 Ethnicities.Black Alone 10 Ethnicities.Hispanic or Latino 11 Ethnicities.Native Hawaiian and Other Pacific ... 12 Ethnicities.Two or More Races 13 Ethnicities.White Alone 14 Ethnicities.White Alone\t not Hispanic or Latino 15 Housing.Homeownership Rate 16 Housing.Households 17 Housing.Housing Units 18 Housing.Median Value of Owner-Occupied Units 19 Housing.Persons per Household 20 Income.Median Houseold Income 21 Income.Per Capita Income 22 Miscellaneous.Foreign Born 23 Miscellaneous.Land Area 24 Miscellaneous.Language Other than English at Home 25 Miscellaneous.Living in Same House +1 Years 26 Miscellaneous.Manufacturers Shipments 27 Miscellaneous.Mean Travel Time to Work 28 Miscellaneous.Percent Female 29 Miscellaneous.Veterans 30 Population.2020 Population 31 Population.2010 Population 32 Population.Population per Square Mile 33 Sales.Accommodation and Food Services Sales 34 Sales.Retail Sales 35 Employment.Firms.Total 36 Employment.Firms.Women-Owned 37 Employment.Firms.Men-Owned 38 Employment.Firms.Minority-Owned 39 Employment.Firms.Nonminority-Owned 40 Employment.Firms.Veteran-Owned 41 Employment.Firms.Nonveteran-Owned dtype: object
As there may be multiple counties with the same name (e.g., "Adams County" has 12 instances), the data is instead indexed by both county name and state designation, such as "Adams County, ND." An example is shown below.
counties.reset_index(inplace=True)
counties['County'] = counties['County'].astype(str) +", "+ counties["State"].astype(str) # concatenate columns
counties = counties.set_index("County") # set as index again
counties.drop(columns=["State"], inplace=True)
harco = counties.loc[["Harford County, MD"]]
harco
| Age.Percent 65 and Older | Age.Percent Under 18 Years | Age.Percent Under 5 Years | Education.Bachelor's Degree or Higher | Education.High School or Higher | Employment.Nonemployer Establishments | Ethnicities.American Indian and Alaska Native Alone | Ethnicities.Asian Alone | Ethnicities.Black Alone | Ethnicities.Hispanic or Latino | ... | Population.Population per Square Mile | Sales.Accommodation and Food Services Sales | Sales.Retail Sales | Employment.Firms.Total | Employment.Firms.Women-Owned | Employment.Firms.Men-Owned | Employment.Firms.Minority-Owned | Employment.Firms.Nonminority-Owned | Employment.Firms.Veteran-Owned | Employment.Firms.Nonveteran-Owned | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| County | |||||||||||||||||||||
| Harford County, MD | 16.6 | 22.2 | 5.6 | 36.7 | 92.7 | 17530 | 0.3 | 3.1 | 14.8 | 4.8 | ... | 560.1 | 390891 | 3792912 | 20330 | 7328 | 10727 | 3134 | 16495 | 2473 | 16889 |
1 rows × 41 columns
We can now standardize each column of the data. To do this, the z-score is computed for each column, which is equivalent to creating a pipeline with a standard scaler operator.
X = (counties - counties.mean())/counties.std() # convert to z-scores
X.head(5)
| Age.Percent 65 and Older | Age.Percent Under 18 Years | Age.Percent Under 5 Years | Education.Bachelor's Degree or Higher | Education.High School or Higher | Employment.Nonemployer Establishments | Ethnicities.American Indian and Alaska Native Alone | Ethnicities.Asian Alone | Ethnicities.Black Alone | Ethnicities.Hispanic or Latino | ... | Population.Population per Square Mile | Sales.Accommodation and Food Services Sales | Sales.Retail Sales | Employment.Firms.Total | Employment.Firms.Women-Owned | Employment.Firms.Men-Owned | Employment.Firms.Minority-Owned | Employment.Firms.Nonminority-Owned | Employment.Firms.Veteran-Owned | Employment.Firms.Nonveteran-Owned | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| County | |||||||||||||||||||||
| Abbeville County, SC | 0.549196 | -0.609584 | -0.949522 | -0.666870 | -0.839228 | -0.206578 | -0.271796 | -0.397138 | 1.262073 | -0.589303 | ... | -0.120312 | -0.210169 | -0.279610 | -0.221310 | -0.201403 | -0.227746 | -0.129359 | -0.284525 | -0.252945 | -0.213273 |
| Acadia Parish, LA | -0.823704 | 1.120198 | 1.039138 | -0.907208 | -1.270673 | -0.113855 | -0.258327 | -0.430981 | 0.597802 | -0.502787 | ... | -0.095677 | -0.169261 | -0.165286 | -0.127290 | -0.126369 | -0.120256 | -0.106743 | -0.134957 | -0.171553 | -0.121967 |
| Accomack County, VA | 1.006829 | -0.350117 | -0.135979 | -0.259340 | -0.871187 | -0.177693 | -0.217920 | -0.261765 | 1.345107 | -0.048580 | ... | -0.107560 | -0.168384 | -0.222193 | -0.176055 | -0.181430 | -0.170843 | -0.128309 | -0.201119 | -0.242822 | -0.170004 |
| Ada County, ID | -1.010918 | 0.370626 | -0.135979 | 1.726063 | 1.317995 | 0.984758 | -0.204451 | 0.381257 | -0.550832 | -0.091837 | ... | 0.065753 | 0.553653 | 0.989189 | 0.958739 | 0.887328 | 0.809485 | 0.032794 | 1.722918 | 1.211310 | 0.894455 |
| Adair County, IA | 0.674005 | -0.032990 | -0.135979 | -0.363835 | 1.158201 | -0.230585 | -0.271796 | -0.363295 | -0.606188 | -0.524416 | ... | -0.142513 | -0.222897 | -0.285953 | -0.236990 | -0.219834 | -0.238274 | -0.147835 | -0.296867 | -0.253755 | -0.230646 |
5 rows × 41 columns
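As a quick check of this equivalence (a sketch on synthetic data, not the county set): pandas' `std()` uses the sample convention (`ddof=1`) while scikit-learn's `StandardScaler` uses the population convention (`ddof=0`), so the two agree up to a factor of $\sqrt{(n-1)/n}$.

```python
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=["a", "b", "c"])

manual = (df - df.mean()) / df.std()         # sample std (ddof=1), as used above
scaled = StandardScaler().fit_transform(df)  # population std (ddof=0)

n = len(df)
print(np.allclose(manual.to_numpy(), scaled * np.sqrt((n - 1) / n)))  # True
```

For clustering purposes the difference is a constant rescaling of every column, so it does not affect the relative distances between counties.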
simplefilter(action='ignore', category=UserWarning) # suppress warnings
X.plot(kind='box', figsize=(20,5))
<AxesSubplot:>
Note that in this analysis, we do not remove any outliers from the data. This is done for several reasons. First, we are examining the polarization of counties in the United States, which by definition implies the data will contain many values that deviate greatly from the median. Second, and most importantly, the plot above shows a large number of outliers in the data; removing them would prevent us from classifying the entirety of the United States as intended. Including the outliers can only provide stronger evidence for a separated view of the US, so if the analysis shows such a result, this assumption can be reconsidered and the computations repeated. The prevalence of outlying values is quantified below to corroborate this statement.
def is_outlier(x):
Q25, Q75 = x.quantile([.25,.75])
I = Q75 - Q25
return (x < Q25 - 1.5*I) | (x > Q75 + 1.5*I)
outliers = X.transform(is_outlier) # boolean mask of outlying values
Xreg = X[~outliers] # outlying values become NaN
Xreg.dropna(inplace = True) # keep only counties with no outlying values
print("Number of US counties: " + str(len(X)))
print("Number of counties with no outlying values: " + str(len(Xreg)))
print("Percentage of such counties: " + f"{100*len(Xreg)/len(X):0.2f}" + "%")
Number of US counties: 3139 Number of counties with no outlying values: 1084 Percentage of such counties: 34.53%
Therefore, the present analysis retains outliers in the data in an attempt to show that the binary view of the US is invalid even with such extrema present.
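For reference, the IQR rule used above flags a value as an outlier when it lies more than 1.5 interquartile ranges outside the middle half of the data. A minimal illustration on a toy series (not the county data):

```python
import pandas as pd

s = pd.Series([1, 2, 3, 4, 100])  # 100 lies far outside the middle half
q25, q75 = s.quantile([.25, .75])
iqr = q75 - q25                   # interquartile range
flags = (s < q25 - 1.5 * iqr) | (s > q75 + 1.5 * iqr)
print(flags.tolist())             # [False, False, False, False, True]
```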
The adjacency data for each US county, from the US Census Bureau, is presented below.
adr = pd.read_csv('https://data.nber.org/census/geo/county-adjacency/2010/county_adjacency2010.csv')
adr.head(5) # adjacency data, raw (before processing)
| countyname | fipscounty | neighborname | fipsneighbor | |
|---|---|---|---|---|
| 0 | Autauga County, AL | 1001 | Autauga County, AL | 1001 |
| 1 | Autauga County, AL | 1001 | Chilton County, AL | 1021 |
| 2 | Autauga County, AL | 1001 | Dallas County, AL | 1047 |
| 3 | Autauga County, AL | 1001 | Elmore County, AL | 1051 |
| 4 | Autauga County, AL | 1001 | Lowndes County, AL | 1085 |
This data presents each inter-county connection as a separate row, which we can convert into a node-node relation. Hence, the county name and state designation from before becomes significant. The data also includes self-connections, which we exclude from the analysis.
For this analysis, the weight of each county connection is given by the Euclidean distance between the counties' feature vectors. This uses the standardized (z-scored) columns computed before to prevent a single feature from dominating the metric. A list of each county connection and its Euclidean weight is constructed below.
county=[] # "from"
neighbor=[] # "to"
weight=[] # euclidean weight from "county" to "neighbor"
for i in range(len(adr)): # loop over each row
cty = adr.iloc[i, 0] # county to check
nbr = adr.iloc[i, 2] # neighbor to check
if cty != nbr: # ignore self-intersection
if (cty in X.index) and (nbr in X.index): # both counties are in before data set
county.append(cty)
neighbor.append(nbr)
            A = X.loc[cty].to_numpy() # row of county data
            B = X.loc[nbr].to_numpy() # row of neighbor data
            weight.append(np.linalg.norm(A - B)) # Euclidean distance between feature vectors
adjacency = pd.DataFrame({"from" : county, "to" : neighbor, "weight": weight})
adjacency.head(5)
| from | to | weight | |
|---|---|---|---|
| 0 | Autauga County, AL | Chilton County, AL | 2.906492 |
| 1 | Autauga County, AL | Dallas County, AL | 6.358240 |
| 2 | Autauga County, AL | Elmore County, AL | 0.964755 |
| 3 | Autauga County, AL | Lowndes County, AL | 6.637880 |
| 4 | Autauga County, AL | Montgomery County, AL | 5.984465 |
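The row-by-row loop above can also be vectorized: look up both feature rows for every pair at once and take the row-wise norm. A sketch with toy stand-ins for `X` and the adjacency pairs (the names and values here are illustrative):

```python
import numpy as np
import pandas as pd

# Toy stand-ins: 3 "counties" with 2 standardized features each
X_demo = pd.DataFrame([[0.0, 1.0], [2.0, 3.0], [4.0, 5.0]],
                      index=["A", "B", "C"], columns=["f1", "f2"])
pairs = pd.DataFrame({"from": ["A", "A"], "to": ["B", "C"]})

a = X_demo.loc[pairs["from"]].to_numpy()  # feature rows of each "from" county
b = X_demo.loc[pairs["to"]].to_numpy()    # feature rows of each "to" county
weights = np.linalg.norm(a - b, axis=1)   # all pairwise Euclidean weights at once
print(weights.round(3))                   # [2.828 5.657]
```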
This creates the county network shown below. The distribution of weights for this network, as well as several characteristic statistics, is summarized as well.
adj = nx.from_pandas_edgelist(adjacency, "from", "to", edge_attr="weight") # create weighted network
sns.displot(data=adjacency, x="weight", log_scale=True);
print("Nodes (n): ", adj.number_of_nodes())
print("Edges (m): ", adj.number_of_edges())
degrees = pd.Series( dict(adj.degree) ) # compute degree of each county
print("Average degree (k): ", degrees.mean())
# connected subgraphs of US (AK, HI, continental, Puerto Rico, Guam)
subs = [adj.subgraph(c).copy() for c in nx.connected_components(adj)]
sub_diam = {i: nx.diameter(g) for i, g in enumerate(subs)} # diameter of each component
print("Subgraph diameters (d): ")
print(sub_diam)
Nodes (n): 3134
Edges (m): 9274
Average degree (k): 5.918315252074027
Subgraph diameters (d):
{0: 68, 1: 5, 2: 3, 3: 1, 4: 1}
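One caveat when building the graph: `nx.from_pandas_edgelist` attaches the weight column only if it is passed via `edge_attr`; otherwise distance-based routines (such as the ego graph used later) silently fall back to hop counts. A minimal sketch:

```python
import pandas as pd
import networkx as nx

edges = pd.DataFrame({"from": ["A", "B"], "to": ["B", "C"], "weight": [1.5, 2.5]})
g = nx.from_pandas_edgelist(edges, "from", "to", edge_attr="weight")
print(g["A"]["B"]["weight"])  # 1.5
```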
A clustering can be computed based solely on the statistical features from before, with no relation to the actual connectedness of each county. This will determine which counties are similar across the US regardless of geographic location. Two different clustering methods are employed: k-means and agglomerative clustering.
A clustering can be created with $n$ clusters using the pre-processed data from before. Each cluster in this algorithm is created to reduce the overall inertia of the dataset, but the inertia necessarily decreases with increasing $n$ as the distance within clusters shrinks, making it a poor measure of quality on its own. Rather than inertia, silhouette values are used to measure cluster performance: the silhouette score for each cluster, as well as the average silhouette score, is computed, which gives a more reliable method to determine the ideal number of clusters.
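To illustrate why the silhouette is preferred over inertia (a toy example on synthetic blobs, not the county data): inertia always falls as $k$ grows, while the silhouette peaks at the true number of groups.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import silhouette_score

# Two well-separated blobs, so k=2 should score highest
pts, _ = make_blobs(n_samples=200, centers=2, cluster_std=0.5, random_state=0)
for k in (2, 3, 4):
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(pts)
    sil = silhouette_score(pts, km.labels_)
    print(k, f"inertia={km.inertia_:.1f}", f"silhouette={sil:.3f}")
```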
This process is performed below. The number of initializations is set to 10 so that a particular random state does not bias the cluster assignments.
ks = [] # number of clusters
avg_sil = [] # average silhouette of each cluster
for k in range(2, 5):
km = KMeans(n_clusters=k, n_init=10, verbose=0, random_state=2002);
km.fit(X);
y_hat = km.labels_ # cluster assignments
ks.append(k)
result = X.copy()
result["cluster"] = y_hat # add cluster label
result["sil"] = silhouette_samples(X, y_hat) # compute silhouette of cluster
print("Clusters: " , str(k))
print(f"inertia: {km.inertia_:.5g}")
print("silhouette medians by cluster:")
print( result.groupby("cluster")["sil"].median() )
avg = np.average(result.groupby("cluster")["sil"].median())
avg_sil.append(avg)
print("average silhouette: " + str(avg))
sns.catplot(data=result,
x="cluster", y="sil",
kind="violin", height=5
)
plt.show()
Clusters: 2 inertia: 1.0185e+05 silhouette medians by cluster: cluster 0 0.782237 1 0.078960 Name: sil, dtype: float64 average silhouette: 0.43059868654818323
Clusters: 3 inertia: 88177 silhouette medians by cluster: cluster 0 0.556099 1 0.097483 2 0.060976 Name: sil, dtype: float64 average silhouette: 0.23818595882335125
Clusters: 4 inertia: 78931 silhouette medians by cluster: cluster 0 0.099352 1 -0.047595 2 0.073739 3 0.394433 Name: sil, dtype: float64 average silhouette: 0.12998212092376385
The above findings can be summarized in the table below.
sil_results = pd.DataFrame({"k" : ks, "Avg. Silhouette" : avg_sil})
sil_results = sil_results.set_index("k")
fsil_0 = sil_results.iloc[0, 0] # save to final results
fsil_0b = sil_results.iloc[1, 0] # save to final results
sil_results
| Avg. Silhouette | |
|---|---|
| k | |
| 2 | 0.430599 |
| 3 | 0.238186 |
| 4 | 0.129982 |
As observed, the optimal number of clusters in this analysis is just two ($k=2$). The first cluster (0) has a high silhouette score that gradually decreases as more clusters are added. This suggests there is a group of counties closely related in these economic and demographic statistics, and increasing the number of clusters only divides them further. Likely this is a select few counties, such as those containing large cities, which explains the low deviation in values.
Increasing from $k=2$ to $k=3$, the second cluster (1) is divided into roughly equal parts with a minimal, though still positive, change in its silhouette score. Increasing the number of clusters further causes a much lower average silhouette score, so the small gain for cluster 1 and the newly formed cluster 2 is not worth it. Therefore, the counties of the US are best divided into two groups.
km = KMeans(n_clusters=2, n_init=10, verbose=0, random_state=2002); # best clustering
km.fit(X);
y_hat = km.labels_ # cluster assignments
X2 = X.copy()
X2["cluster"] = y_hat # add cluster assignments
c_vals = pd.Series(y_hat)
print("Number of counties in each cluster: ")
c_vals.value_counts()
Number of counties in each cluster:
0 3084 1 55 dtype: int64
These results can be visualized geographically; the cluster labels are exported by the script below and mapped externally (here using Excel), producing the plot that follows.
county_labels = pd.DataFrame(y_hat, index=counties.index)
county_labels.head(5)
| 0 | |
|---|---|
| County | |
| Abbeville County, SC | 0 |
| Acadia Parish, LA | 0 |
| Accomack County, VA | 0 |
| Ada County, ID | 0 |
| Adair County, IA | 0 |
#county_labels.to_csv("county_labels.csv") # save data file
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("k_means_geo_k=2.png")
plt.imshow(image)
plt.show()
As observed, the original assumption appears to be the opposite of what is found. Rather than a few big cities dominating their own cluster, it is the rest of the United States that has a roughly uniform statistical distribution. The small number of city counties (cluster 1) have a larger range of values and contribute to the lower silhouette score in each case.
The analysis performed before used the k-means clustering algorithm to determine which number of clusters had the highest average silhouette score. However, the distances between clusters and the method of measuring cluster separation were not optimized. In this section, agglomerative clustering is used to determine the optimal number of clusters across a variety of linkage criteria.
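The linkage criterion controls which inter-cluster distance is minimized at each merge: the minimum pairwise distance for single, the maximum for complete, the mean for average, and the increase in variance for Ward. A brief sketch of the comparison on toy points (illustrative data, not the county set):

```python
import numpy as np
from sklearn.cluster import AgglomerativeClustering

# Two clearly separated groups: a pair on the left, a triple on the right
pts = np.array([[0, 0], [0, 1], [5, 0], [5, 1], [5, 2]])
for link in ("single", "complete", "average", "ward"):
    labels = AgglomerativeClustering(n_clusters=2, linkage=link).fit_predict(pts)
    print(link, labels.tolist())
```

On well-separated groups like these all four linkages agree; differences emerge on data with chains or bridging points, which is why the grid search below varies the linkage.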
ks = [] # number of clusters
links = [] # linkage type
avg_sil = [] # average silhouette of each cluster
for k in range(2,5):
for l in ["single", "complete", "average", "ward"]:
# Create and fit cluster
agg = AgglomerativeClustering(n_clusters=k, linkage=l);
agg.fit(X);
z_hat = agg.labels_ # cluster assignments
results = X.copy()
results["cluster"] = z_hat # add cluster label
results["sil"] = silhouette_samples(X, z_hat) # compute silhouette of cluster
avg = np.average(results.groupby("cluster")["sil"].median()) # average cluster score
# Add data to list
links.append(l)
ks.append(k)
avg_sil.append(avg)
# Create dataframe, sort values
agg_results = pd.DataFrame({"k" : ks, "Linkage" : links, "Avg. Silhouette" : avg_sil})
agg_results.sort_values(by="Avg. Silhouette", ascending=False, inplace=True)
fsil_1 = agg_results.iloc[0, 2] # save to final results
fsil_1b = agg_results.iloc[4, 2] # save to final results
fsil_1c = agg_results.iloc[6, 2] # save to final results
agg_results
| k | Linkage | Avg. Silhouette | |
|---|---|---|---|
| 3 | 2 | ward | 0.496070 |
| 0 | 2 | single | 0.475134 |
| 1 | 2 | complete | 0.475134 |
| 2 | 2 | average | 0.475134 |
| 6 | 3 | average | 0.405241 |
| 5 | 3 | complete | 0.382642 |
| 10 | 4 | average | 0.365941 |
| 4 | 3 | single | 0.295068 |
| 9 | 4 | complete | 0.285835 |
| 8 | 4 | single | 0.221380 |
| 7 | 3 | ward | 0.199837 |
| 11 | 4 | ward | 0.173677 |
As observed, the pattern found with k-means clustering also appears in agglomerative clustering: increasing $k$ generally lowers the average silhouette score. The best clustering found used $k=2$ with Ward linkage, reaching an average silhouette score of almost 0.5, which means this agglomerative method performed roughly 15% better than k-means based on the average silhouette score across clusters.
A silhouette plot is created for this optimal clustering.
agg = AgglomerativeClustering(n_clusters=2, linkage="ward");
agg.fit(X);
z_hat = agg.labels_ # cluster assignments
results = X.copy()
results["cluster"] = z_hat # add cluster label
results["sil"] = silhouette_samples(X, z_hat) # compute silhouette of cluster
print("Silhouette medians by cluster:")
print( pd.Series(results.groupby("cluster")["sil"].median()).to_dict() )
sns.catplot(data=results, x="cluster", y="sil", kind="violin", height=5)
plt.show()
c_vals = pd.Series(z_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
Silhouette medians by cluster:
{0: 0.8564568180909888, 1: 0.13568339782639516}
Number of counties in each cluster:
{0: 3119, 1: 20}
Between the k-means and agglomerative clusterings shown above, the median silhouette value of each cluster increased, supporting the claim that this is the optimal clustering. Every $k=2$ case of the agglomerative approach increased the average silhouette value across clusters compared to k-means clustering, suggesting the counties of the United States are better clustered by gradually merging like regions than by a nearest-centroid approach.
Similar to the k-means approach, the majority of samples fall into cluster 0 with a high silhouette score and low deviation, suggesting that, as expected, most of the US is similar in economic and demographic data. The outliers of the data are placed in the "other" cluster, cluster 1, whose members deviate greatly both from cluster 0 and from one another. Like before, the results can be plotted to show the geographic interpretation of this result.
county_labels = pd.DataFrame(z_hat, index=counties.index)
county_labels.head(5)
| 0 | |
|---|---|
| County | |
| Abbeville County, SC | 0 |
| Acadia Parish, LA | 0 |
| Accomack County, VA | 0 |
| Ada County, ID | 0 |
| Adair County, IA | 0 |
#county_labels.to_csv("county_labels_2.csv") # save data file
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("agg_ward_geo_k=2.png")
plt.imshow(image)
plt.show()
With this clustering, there is a greater disparity between county assignments, as cluster 1 has fewer members than in the k-means clustering. The counties in cluster 1 appear to be a subset of the k-means cluster 1, suggesting those counties were on the cusp of grouping.
The clusterings above produce one very small cluster in order to reduce the overall variance in the data. Using expectation-maximization (EM) mixture models, however, we can discourage such groupings: iteratively fitting a Gaussian mixture model tends to avoid clusters with very few members.
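The key difference from the hard assignments above is that a Gaussian mixture yields soft responsibilities via `predict_proba`, with each sample's probabilities summing to one. A sketch on synthetic data (not the county set):

```python
import numpy as np
from sklearn.mixture import GaussianMixture
from sklearn.datasets import make_blobs

pts, _ = make_blobs(n_samples=300, centers=2, cluster_std=0.6, random_state=0)
gm = GaussianMixture(n_components=2, covariance_type="tied",
                     init_params="kmeans", random_state=0).fit(pts)
proba = gm.predict_proba(pts)  # soft assignments, one row per sample
print(proba.shape, np.allclose(proba.sum(axis=1), 1.0))  # (300, 2) True
```

Hard labels from `predict` are simply the argmax of these responsibilities, which is what the silhouette computations below operate on.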
We can perform a grid search to optimize the hyperparameters of such a model and create a more even clustering distribution. For this, the number of clusters and the covariance type are varied. The initial weights are determined using the k-means algorithm to give an accurate baseline for comparison.
ks = [] # number of clusters
covs = [] # type of covariance to use
avg_sil = [] # average silhouette of each cluster
for k in range(2,5):
for cov in ['full', 'tied', 'diag', 'spherical']:
gm = GaussianMixture(n_components=k, covariance_type=cov, init_params='kmeans', random_state=2002)
gm.fit(X)
w_hat = gm.predict(X) # probabilistic prediction
results = X.copy()
results["cluster"] = w_hat # add cluster label
results["sil"] = silhouette_samples(X, w_hat) # compute silhouette of cluster
avg = np.average(results.groupby("cluster")["sil"].median()) # average cluster score
# Add data to list
covs.append(cov)
ks.append(k)
avg_sil.append(avg)
# Create dataframe, sort values
em_results = pd.DataFrame({"k" : ks, "Covariance" : covs, "Avg. Silhouette" : avg_sil})
em_results.sort_values(by="Avg. Silhouette", ascending=False, inplace=True)
fsil_2 = em_results.iloc[0, 2] # save to final results
fsil_2b = em_results.iloc[1, 2] # save to final results
em_results
| k | Covariance | Avg. Silhouette | |
|---|---|---|---|
| 1 | 2 | tied | 0.407277 |
| 5 | 3 | tied | 0.254437 |
| 9 | 4 | tied | 0.165664 |
| 3 | 2 | spherical | 0.159767 |
| 4 | 3 | full | 0.152434 |
| 0 | 2 | full | 0.145999 |
| 2 | 2 | diag | 0.133524 |
| 8 | 4 | full | 0.072426 |
| 7 | 3 | spherical | 0.018805 |
| 11 | 4 | spherical | 0.012627 |
| 10 | 4 | diag | 0.002215 |
| 6 | 3 | diag | -0.030400 |
This clustering method is worse overall at maintaining a high average silhouette score, but for $k=2$ clusters with the tied covariance type, the results are comparable to the k-means algorithm. A silhouette plot for the best performing expectation-maximization clustering is shown below.
gm = GaussianMixture(n_components=2, covariance_type='tied', init_params='kmeans', random_state=2002)
gm.fit(X);
w_hat = gm.predict(X) # probabilistic prediction
results = X.copy()
results["cluster"] = w_hat # add cluster label
results["sil"] = silhouette_samples(X, w_hat) # compute silhouette of cluster
print("Silhouette medians by cluster:")
print( pd.Series(results.groupby("cluster")["sil"].median()).to_dict() )
sns.catplot(data=results, x="cluster", y="sil", kind="violin", height=5)
plt.show()
c_vals = pd.Series(w_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
Silhouette medians by cluster:
{0: 0.7580883927930174, 1: 0.05646592473471834}
Number of counties in each cluster:
{0: 3073, 1: 66}
As mentioned before, this clustering method discourages small cluster sizes, which has the consequence of a lower average silhouette score. This trade-off yields more interpretable clusters at the cost of worse performance, an outcome that provides evidence for the non-binary view of the United States in that the algorithm does a poor job of dividing up the counties and instead treats them as one combined cluster.
The EM method discourages smaller cluster sizes, so we can plot a geographical view of this clustering like before but now with more samples. Shown below, the best performing $k=2$ and $k=3$ clusterings are mapped using the EM method. This algorithm is also probabilistic and can thus be used to determine the likelihood of each county receiving a given cluster label. Doing so, a wider range of values can be used to assess how binary the county clustering is for both maps.
# k=2, cov = tied
gm = GaussianMixture(n_components=2, covariance_type='tied', init_params='kmeans', random_state=2002)
gm.fit(X);
w_hat = gm.predict_proba(X) # probabilistic prediction
gmp = pd.DataFrame(w_hat, index=X.index)[0]
gmp = gmp.sort_values(ascending=False) # probability of being in cluster 1
county_labels = pd.DataFrame(w_hat, index=counties.index)
#county_labels.to_csv("county_labels_3.csv") # save data file
The plot shown below colors the probability of being in cluster 0 for each county based on the probabilistic interpretation found from the EM method.
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("em_geo_k=2.png")
plt.imshow(image)
plt.show()
w_hat = gm.predict(X)
c_vals = pd.Series(w_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
print("Average cluster silhouette: " + str(em_results.iloc[0]["Avg. Silhouette"]) )
Number of counties in each cluster:
{0: 3073, 1: 66}
Average cluster silhouette: 0.4072771587638679
Cluster 1 now contains a larger number of counties, extending the trend from the agglomerative clustering of capturing counties on the edge of being labelled cluster 0. Similar regions are assigned to one cluster or the other, consistent with the previous results.
For the case of $k=3$, coloring by probability cannot be shown as clearly, so each cluster is instead given a separate color.
# k=3, cov = tied
gm = GaussianMixture(n_components=3, covariance_type='tied', init_params='kmeans', random_state=2002)
gm.fit(X);
w_hat = gm.predict_proba(X) # probabilistic prediction
gmp = pd.DataFrame(w_hat, index=X.index) # probability of being in a given cluster
gmp.head(5)
| 0 | 1 | 2 | |
|---|---|---|---|
| County | |||
| Abbeville County, SC | 1.0 | 5.604427e-225 | 1.607282e-21 |
| Acadia Parish, LA | 1.0 | 1.501533e-229 | 9.592040e-21 |
| Accomack County, VA | 1.0 | 6.369716e-229 | 1.294897e-22 |
| Ada County, ID | 1.0 | 4.020907e-200 | 2.160049e-16 |
| Adair County, IA | 1.0 | 3.814114e-226 | 1.327451e-20 |
w_hat = gm.predict(X) # rounded prediction, ignore probability
gmp = pd.DataFrame(w_hat, index=X.index)
county_labels = pd.DataFrame(w_hat, index=counties.index)
#county_labels.to_csv("county_labels_5.csv") # save data file
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("em_geo_k=3.png")
plt.imshow(image)
plt.show()
c_vals = pd.Series(w_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
print("Average cluster silhouette: " + str(em_results.iloc[1]["Avg. Silhouette"]) )
Number of counties in each cluster:
{0: 3019, 2: 107, 1: 13}
Average cluster silhouette: 0.2544366630404996
The mapped results of the $k=2$ and $k=3$ EM methods are similar. While more counties are assigned to the smaller clusters, the silhouette score drops with this method, and the labels do not conform to those found using either k-means or agglomerative clustering. These results further subdivide the cluster 1 set from before. Therefore, we can conclude agglomerative clustering is still the best method for this data, as these newer methods do little to improve the clustering.
Rather than clustering blindly on the known economic and demographic data alone, we can instead require that counties be grouped only if they are geographically adjacent. This creates distinct regions in the US joined by county adjacencies, with the feature data providing edge weights between county nodes. An example ego graph for Harford County, Maryland is shown below, where the separation between counties is given by the Euclidean distance between their feature vectors.
ego_harco = nx.ego_graph(adj, "Harford County, MD", distance='weight', radius=1)
nx.draw(ego_harco, with_labels=True, node_size=500, node_color="orange")
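To make the graph construction concrete, the sketch below builds a tiny weighted adjacency graph the same way (a minimal sketch with made-up county names and feature vectors; the real analysis uses the NBER adjacency list [4] and the census features in `X`):

```python
import networkx as nx
import numpy as np

# Hypothetical mini-example: three "counties" with 2-d feature vectors.
features = {
    "County A": np.array([0.0, 0.0]),
    "County B": np.array([3.0, 4.0]),
    "County C": np.array([0.0, 1.0]),
}
adjacent = [("County A", "County B"), ("County A", "County C")]  # adjacency pairs

g = nx.Graph()
for u, v in adjacent:
    # Edge weight = Euclidean distance between the two feature vectors
    g.add_edge(u, v, weight=float(np.linalg.norm(features[u] - features[v])))

print(g["County A"]["County B"]["weight"])  # 5.0
```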
Determining where to cut the graph into separate subgraphs is an NP-hard problem and thus requires more advanced algorithms; spectral clustering will be used here. Hyperparameter optimization is employed but significantly simplified to save computational time, giving an accurate yet computationally friendly estimate of the optimal subgraph splitting.
simplefilter(action='ignore', category=UserWarning) # supress warnings
ns = [] # clusters
ks = [] # nearest neighbors
ls = [] # label assignment method
sils = [] # average silhouette values
for n in range(2,5): # clusters
for k in range(2, 10): # n-neighbors
for l in ["kmeans", "discretize"]:
# Define and fit clustering
spectral_clustering = SpectralClustering(
n_clusters=n,
random_state = 2002,
n_init = 6,
affinity="nearest_neighbors",
n_neighbors = k,
assign_labels=l,
n_jobs=-1
) # hyperparameters used
spc = spectral_clustering.fit(X) # fit clustering
s_hat = spc.labels_ # get labels
results = X.copy()
results["cluster"] = s_hat # add cluster label
results["sil"] = silhouette_samples(X, s_hat) # compute silhouette of cluster
avg = np.average(results.groupby("cluster")["sil"].median()) # average cluster score
# Add to data
ns.append(n)
ks.append(k)
ls.append(l)
sils.append(avg)
# Create dataframe
spc_vals = pd.DataFrame({'Clusters' : ns, 'Neighbors' : ks, 'Labels' : ls, 'Avg Silhouette' : sils})
spc_vals = spc_vals.sort_values(by='Avg Silhouette', ascending=False)
fsil_3 = spc_vals.iloc[0]["Avg Silhouette"] # save spectral k=2 score to final results
fsil_3b = spc_vals.iloc[5]["Avg Silhouette"] # save spectral k=3 score to final results
spc_vals.head(10)
| | Clusters | Neighbors | Labels | Avg Silhouette |
|---|---|---|---|---|
| 4 | 2 | 4 | kmeans | 0.529240 |
| 5 | 2 | 4 | discretize | 0.529240 |
| 10 | 2 | 7 | kmeans | 0.458866 |
| 6 | 2 | 5 | kmeans | 0.458866 |
| 8 | 2 | 6 | kmeans | 0.458866 |
| 20 | 3 | 4 | kmeans | 0.397198 |
| 3 | 2 | 3 | discretize | 0.379289 |
| 19 | 3 | 3 | discretize | 0.340084 |
| 29 | 3 | 8 | discretize | 0.312214 |
| 36 | 4 | 4 | kmeans | 0.309854 |
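To clarify what the spectral method is doing under the hood, the sketch below reproduces its usual pipeline on toy data (an assumed simplification of scikit-learn's implementation, not the exact code it runs): build a k-nearest-neighbor affinity graph, embed the nodes with the smallest eigenvectors of the normalized graph Laplacian, and run k-means in that embedding.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.neighbors import kneighbors_graph

# Toy data: two well-separated blobs of 20 points each.
rng = np.random.default_rng(0)
pts = np.vstack([rng.normal(0.0, 0.1, (20, 2)), rng.normal(5.0, 0.1, (20, 2))])

A = kneighbors_graph(pts, n_neighbors=4, include_self=False).toarray()
A = np.maximum(A, A.T)                             # symmetrize the affinity matrix
d = A.sum(axis=1)                                  # node degrees
L = np.eye(len(A)) - A / np.sqrt(np.outer(d, d))   # normalized graph Laplacian
vals, vecs = np.linalg.eigh(L)                     # eigenvalues in ascending order
embedding = vecs[:, :2]                            # two smallest eigenvectors
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(embedding)
```

Because the two blobs form disconnected components of the k-NN graph, the embedding collapses each blob to a single point and k-means separates them perfectly.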
The silhouette scores for this clustering can then be found below.
spectral_clustering = SpectralClustering(
n_clusters=2,
random_state = 2002,
n_init = 10,
affinity="nearest_neighbors",
n_neighbors = 4,
assign_labels="kmeans",
n_jobs=-1
) # define method used
spc = spectral_clustering.fit(X) # fit clustering
s_hat = spc.labels_ # get labels
results = X.copy()
results["cluster"] = s_hat # add cluster label
results["sil"] = silhouette_samples(X, s_hat) # compute silhouette of cluster
print("Silhouette medians by cluster:")
sils = pd.Series(results.groupby("cluster")["sil"].median())
print( sils.to_dict() )
print("Average silhouette:")
print(np.average(sils))
sns.catplot(data=results, x="cluster", y="sil", kind="violin", height=5)
plt.show()
c_vals = pd.Series(s_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
Silhouette medians by cluster:
{0: 0.7973155485697738, 1: 0.26116522225869837}
Average silhouette:
0.5292403854142361
Number of counties in each cluster:
{0: 3134, 1: 5}
As observed, this clustering method performs better than previous methods by keeping adjacent counties connected. A geographic plot can be created for these connected counties.
county_labels = pd.DataFrame(s_hat, index=counties.index)
#county_labels.to_csv("county_labels_6.csv") # save data file
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("spc_geo_k=2.png")
plt.imshow(image)
plt.show()
c_vals = pd.Series(s_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
print("Average cluster silhouette: " + str(spc_vals.iloc[0]["Avg Silhouette"]) )
Number of counties in each cluster:
{0: 3134, 1: 5}
Average cluster silhouette: 0.5292403854142361
These results support two noteworthy conclusions. First, the continental United States is best defined as a single cluster, as tested through a variety of clustering methods. Second, spectral clustering suggests Alaska is more closely related to the continental United States than Hawaii, which intuitively makes sense given the demographic and economic data. To recreate the earlier results, $k=4$ is needed: one cluster for Alaska, one for Hawaii, and the remaining two dividing the continental United States. The hyperparameters used to create this graph come from the earlier grid optimization, using the best performing hyperparameters for $k=4$.
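Reading the best hyperparameters for a given cluster count back out of the grid results is a simple filter-and-sort; the sketch below uses a toy frame mimicking the structure of `spc_vals` (the values are illustrative, not the real grid output):

```python
import pandas as pd

# Toy stand-in for the grid search results frame (illustrative values only).
spc_vals = pd.DataFrame({
    "Clusters":       [2, 2, 3, 4, 4],
    "Neighbors":      [4, 7, 4, 4, 9],
    "Labels":         ["kmeans", "kmeans", "discretize", "kmeans", "discretize"],
    "Avg Silhouette": [0.529, 0.459, 0.379, 0.310, 0.250],
})

# Keep only the k=4 rows, then take the highest-scoring configuration.
best_k4 = (spc_vals[spc_vals["Clusters"] == 4]
           .sort_values("Avg Silhouette", ascending=False)
           .iloc[0])
print(best_k4["Neighbors"], best_k4["Labels"])  # 4 kmeans
```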
spectral_clustering = SpectralClustering(
n_clusters=4,
random_state = 2002,
n_init = 10,
affinity="nearest_neighbors",
n_neighbors = 50,
assign_labels="kmeans",
n_jobs=-1
) # define method used
spc = spectral_clustering.fit(X) # fit clustering
s_hat = spc.labels_ # get labels
results = X.copy()
results["cluster"] = s_hat # add cluster label
results["sil"] = silhouette_samples(X, s_hat) # compute silhouette of cluster
print("Silhouette medians by cluster:")
sils = pd.Series(results.groupby("cluster")["sil"].median())
print( sils.to_dict() )
print("Average silhouette:")
print(np.average(sils))
sns.catplot(data=results, x="cluster", y="sil", kind="violin", height=5)
plt.show()
c_vals = pd.Series(s_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
Silhouette medians by cluster:
{0: 0.26635253096651945, 1: -0.03388292282747894, 2: 0.29310678353792513, 3: 0.2238888604862083}
Average silhouette:
0.18736631304079346
Number of counties in each cluster:
{3: 2351, 2: 386, 1: 218, 0: 184}
county_labels = pd.DataFrame(s_hat, index=X.index)
#county_labels.to_csv("county_labels_7.csv") # save data file
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("spc_geo_k=4.png")
plt.imshow(image)
plt.show()
c_vals = pd.Series(s_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
print("Average cluster silhouette: " + str(spc_vals.iloc[9]["Avg Silhouette"]) )
fsil_3c = spc_vals.iloc[9]["Avg Silhouette"]
Number of counties in each cluster:
{3: 2351, 2: 386, 1: 218, 0: 184}
Average cluster silhouette: 0.30985386834088685
The principles discussed above for clustering regions of the United States can be extended to create several connected regions. Similar optimal hyperparameters are used as before, but with a much larger number of nearest neighbors, set to 1/4 of the total number of counties, to encourage four connected regions of like labels.
spectral_clustering = SpectralClustering(
n_clusters=4,
random_state = 2002,
n_init = 10,
affinity="nearest_neighbors",
n_neighbors = int(len(X)/4),
assign_labels="kmeans",
n_jobs=-1
) # define method used
spc = spectral_clustering.fit(X) # fit clustering
s_hat = spc.labels_ # get labels
results = X.copy()
results["cluster"] = s_hat # add cluster label
results["sil"] = silhouette_samples(X, s_hat) # compute silhouette of cluster
print("Silhouette medians by cluster:")
sils = pd.Series(results.groupby("cluster")["sil"].median())
print( sils.to_dict() )
print("Average silhouette:")
print(np.average(sils))
fsil_4 = np.average(sils) # save to final results
sns.catplot(data=results, x="cluster", y="sil", kind="violin", height=5)
plt.show()
c_vals = pd.Series(s_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
Silhouette medians by cluster:
{0: -0.16217613847803253, 1: 0.14049076973456662, 2: 0.3304741413054011, 3: -0.08098758967566834}
Average silhouette:
0.05695029572156671
Number of counties in each cluster:
{2: 1223, 3: 659, 0: 646, 1: 611}
These regions can be mapped as shown before.
county_labels = pd.DataFrame(s_hat, index=X.index)
#county_labels.to_csv("county_labels_8.csv") # save data file
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("spc_reg_geo_k=4.png")
plt.imshow(image)
plt.show()
c_vals = pd.Series(s_hat)
print("Number of counties in each cluster: ")
print(c_vals.value_counts().to_dict())
print("Average cluster silhouette: " + str(np.average(sils)) )
Number of counties in each cluster:
{2: 1223, 3: 659, 0: 646, 1: 611}
Average cluster silhouette: 0.05695029572156671
As expected, this clustering performs very poorly, confirming that the optimal clustering was found earlier.
The average silhouette score of all clusters found in each clustering analysis is shown below. The results are ordered by highest silhouette score, and thus best performing clustering, with some of the relevant hyperparameters or distinguishing features of each analysis noted.
results_data = {"Method" : ["k-Means", "k-Means", "Agglomerative", "Agglomerative", "Agglomerative", "Expectation-Maximization", "Expectation-Maximization", "Spectral", "Spectral", "Spectral", "Spectral (Regional)"],
"Clusters" : [2,3,2,3,4,2,3,2,3,4,4],
"Linkage" : ["-", "-", "Ward", "Average", "Average", "-", "-", "-", "-", "-", "-"],
"Covariance" : ["-", "-", "-", "-", "-", "Tied", "Tied", "-", "-", "-", "-"],
"Labelling" : ["-", "-", "-", "-", "-", "-", "-", "k-Means", "k-Means", "k-Means", "k-Means"],
"Neighbors" : ["-", "-", "-", "-", "-", "-", "-", 4, 4, 4, 784],
"Avg. Silhouette" : [fsil_0, fsil_0b, fsil_1, fsil_1b, fsil_1c, fsil_2, fsil_2b, fsil_3, fsil_3b, fsil_3c, fsil_4]
}
results = pd.DataFrame(data=results_data)
results = results.sort_values(by="Avg. Silhouette", ascending=False) # sort values
results = results.reset_index() # reset indices
results.drop(columns=["index"], inplace=True)
results
| | Method | Clusters | Linkage | Covariance | Labelling | Neighbors | Avg. Silhouette |
|---|---|---|---|---|---|---|---|
| 0 | Spectral | 2 | - | - | k-Means | 4 | 0.529240 |
| 1 | Agglomerative | 2 | Ward | - | - | - | 0.496070 |
| 2 | k-Means | 2 | - | - | - | - | 0.430599 |
| 3 | Expectation-Maximization | 2 | - | Tied | - | - | 0.407277 |
| 4 | Agglomerative | 3 | Average | - | - | - | 0.405241 |
| 5 | Spectral | 3 | - | - | k-Means | 4 | 0.397198 |
| 6 | Agglomerative | 4 | Average | - | - | - | 0.365941 |
| 7 | Spectral | 4 | - | - | k-Means | 4 | 0.309854 |
| 8 | Expectation-Maximization | 3 | - | Tied | - | - | 0.254437 |
| 9 | k-Means | 3 | - | - | - | - | 0.238186 |
| 10 | Spectral (Regional) | 4 | - | - | k-Means | 784 | 0.056950 |
The best performing clustering was found using spectral clustering with the minimum number of clusters, $k=2$. The results from the previous section are shown again below.
plt.figure(figsize=(15,8))
plt.axis('off')
image = mpimg.imread("spc_geo_k=2.png")
plt.imshow(image)
plt.show()
print("Average cluster silhouette: " + str(spc_vals.iloc[0]["Avg Silhouette"]) )
Average cluster silhouette: 0.5292403854142361
As observed in the above plot, the best performing clustering treats the continental United States as a single cluster. While these results suggest Hawaii is the most isolated from the rest of the US, this likely occurs because the algorithm must choose some subgroup of counties to form the second cluster; Hawaii's five counties are all likely similar to one another in terms of demographics and economic statistics, so they form a cluster with a relatively high silhouette score. Note that Delaware has the fewest counties of any state, but because Delaware's counties share features with those of neighboring states, forming Delaware as a separate cluster would yield a lower silhouette score than Hawaii's.
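A toy example illustrates why a tiny, internally homogeneous group can earn a high silhouette score (made-up data, not census values): 50 scattered "mainland" points against 5 tightly packed, distant "island" points.

```python
import numpy as np
from sklearn.metrics import silhouette_samples

rng = np.random.default_rng(1)
mainland = rng.normal(0.0, 1.0, (50, 2))   # broad, loose cluster
island = rng.normal(10.0, 0.1, (5, 2))     # small, tight, far-away cluster
X_toy = np.vstack([mainland, island])
labels = np.array([0] * 50 + [1] * 5)

sil = silhouette_samples(X_toy, labels)
print(round(sil[50:].mean(), 3))  # island points score near 1
```

Because the island points are much closer to each other than to any mainland point, their silhouette values approach 1 despite the cluster containing only five samples.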
This gives strong evidence that the United States is not divided between rural and urban regions and that, in general, a binary view of the US is invalid. The best performing clusterings are those which consider the US to have the fewest clusters and/or the fewest counties in the secondary cluster. Any division of the US into two distinct groups, such as the $k=2$ k-means clustering, did so by isolating densely populated regions such as the suspected big cities (New York City, Baltimore, Miami, etc.). However, as shown in the results above, ignoring the big cities and instead clustering the continental US as one large cluster increased the average silhouette score by 6.69%.
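As a quick arithmetic check, assuming the 6.69% figure compares the spectral $k=2$ score against the best blind clustering (agglomerative, $k=2$) in the summary table:

```python
# Scores taken from the summary table above.
spectral, agglomerative = 0.529240, 0.496070
gain = 100 * (spectral - agglomerative) / agglomerative  # percent improvement
print(round(gain, 2))  # 6.69
```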
These results also show that a network analysis of the United States, such as spectral clustering, performs better than a truly blind clustering such as the k-means, agglomerative, or expectation-maximization methods. Regions in the US joined by county connections, as approximated by this data, share similar economic and demographic characteristics but may not cluster best on a local scale. Taking these geographic connections into account, the clustering of the US as a whole is much stronger, suggesting that while it is not accurate to divide the US into separate categories such as rural, suburban, or urban, grouping by closely connected counties may be a better approximation.
In summary, while political data may suggest the US is divided into rural and urban counties [1], this analysis indicates that, economically and demographically, the US is best viewed as a whole, undivided country. More research is needed to determine the best divisions of the US beyond the continental/island split shown here. Future work could redraw state lines based on this data to group strong clusters and thus create subnetworks with a maximum average silhouette score. This is a challenging problem, briefly touched upon in this report, but it could provide evidence for how strongly related certain regions of the country are, how many such regions exist, and so on.
[1] Love, Hanna, and Tracy Hadden Loh. “The ‘rural-Urban Divide’ Furthers Myths about Race and Poverty-Concealing Effective Policy Solutions.” Brookings, December 8, 2020. https://www.brookings.edu/blog/the-avenue/2020/12/08/the-rural-urban-divide-furthers-myths-about-race-and-poverty-concealing-effective-policy-solutions/.
[2] “Largest Countries in the World (by Area).” Worldometer. Accessed May 18, 2023. https://www.worldometers.info/geography/largest-countries-in-the-world/.
[3] Whitcomb, Ryan, Joung Min Choi, and Bo Guan. “County Demographics CSV File.” CORGIS Datasets Project, from Austin Cory Bart, Dennis Kafura, Clifford A. Shaffer, Javier Tibau, Luke Gusukuma, Eli Tilevich, November 5, 2022. https://corgis-edu.github.io/corgis/csv/county_demographics/.
[4] National Bureau of Economic Research. “County Adjacency.” NBER, from U.S. Census Bureau, May 8, 2017. https://www.nber.org/research/data/county-adjacency.