## CONVERT TO CATEGORY DATATYPE

# Load essential libraries
import pandas as pd
import numpy as np
import seaborn as sns 
import matplotlib.pyplot as plt

# Read csv file
df = pd.read_csv('../data/cleaned_wf_demo.csv', index_col=0)

# Convert relevant columns to categorical datatype
df['ethnicity'] = df['ethnicity'].astype('category').cat.codes
df['gender'] = df['gender'].astype('category').cat.codes
df['job_category'] = df['job_category'].astype('category').cat.codes

# Confirm df
df.head(5)


## STANDARDIZE

# Load libraries for standardization
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.compose import ColumnTransformer

# Drop company column from df for X, as well as gender and ethnicity columns with strings
X = df.copy()
X = X.drop(['company'], axis=1)

# Scale continuous variables in X
X['salary'] = StandardScaler().fit_transform(np.array(X['salary']).reshape(-1,1))
X['count'] = StandardScaler().fit_transform(np.array(X['count']).reshape(-1,1))
X['inflow'] = StandardScaler().fit_transform(np.array(X['inflow']).reshape(-1,1))
X['outflow'] = StandardScaler().fit_transform(np.array(X['outflow']).reshape(-1,1))

# Save standardized X to new dataset
X.to_csv('../data/Workforce_X_Standardized.csv')

# Read the standardized X dataset
X = pd.read_csv('../data/Workforce_X_Standardized.csv', index_col=0)
X.head()


# Identify feature and target columns 
feature_cols = ['salary', 'inflow', 'outflow', 'count']
# print(feature_cols)

# Display heat-map for the correlation matrix
# Use X[feature_cols] instead of df[feature_cols] since we want to reference back to X
corr = X[feature_cols].corr(); 
print(corr.shape)
sns.set_theme(style="white")
f, ax = plt.subplots(figsize=(5,5))  # Set up the matplotlib figure
cmap = sns.diverging_palette(0, 255, sep=15, n=16, as_cmap=True) # Generate a custom diverging colormap
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr,  cmap=cmap, vmin=-1, vmax=1, center=0,
        square=True, linewidths=.5, cbar_kws={"shrink": .5})
# cbar_kws --> removing the shrink will space them out better

plt.xticks(rotation=45)

plt.show()

(4, 4)


##### K-MEANS CLUSTERING #####

# Import relevant libraries for K-Means clustering
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay, accuracy_score

# Evaluate the optimal number of clusters using the elbow method 
distortions = []
inertias = []
k = 20

for k in range(1,k):
    kmeanModel = KMeans(n_clusters=k, init='k-means++', random_state=0)
    kmeanModel.fit(X)

    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)) / X.shape[0])
    inertias.append(kmeanModel.inertia_)
    evaluation = pd.DataFrame.from_records({'Cluster': np.arange(1, k+1), 'Distortion': distortions, 'Inertia': inertias})

evaluation

# Plot distortion and inertia elbow plot for kmeans
evaluation.plot.line(x = 'Cluster', subplots=True, figsize=(10, 5))

array([<AxesSubplot:xlabel='Cluster'>, <AxesSubplot:xlabel='Cluster'>],
      dtype=object)


# Choose k=6 as the optimal number of clusters

# Show various lengths
sns.set_theme(style = 'white', palette='Set1')
bestK = KMeans(n_clusters=6, init='k-means++', random_state=0, max_iter=500)
labels4 = bestK.fit_predict(X)
X['klabels'] = labels4

# Plot actual clusters and K-means clusters
fig, ax = plt.subplots(1, 2, figsize=(10, 5), dpi = 150)
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'company', data = df, ax = ax[0])
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'klabels', data = X, ax = ax[1])

<AxesSubplot:xlabel='salary', ylabel='outflow'>


###### DBSCAN ######

# Import necessary library for DBSCAN clustering
# . plot the number of clusters vs the silhouette score. Suggest the optimal number of clusters based on the plot.
from sklearn.cluster import DBSCAN

# Use the eps and min_samples parameters to find the optimal number of clusters
model = DBSCAN(eps = 8, min_samples = 2).fit(X)
labels_DB = model.labels_
X['DBlabels'] = labels_DB

# Plot actual labels vs DBSCAN labels
fig, ax = plt.subplots(1, 2, figsize=(10, 5), dpi = 150)
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'company', data = df, ax = ax[0])
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'DBlabels', data = X, ax = ax[1])

<AxesSubplot:xlabel='salary', ylabel='outflow'>


###### HIERARCHICAL CLUSTERING ######

# Perform Agglomerative Clustering
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.cluster import AgglomerativeClustering

model = AgglomerativeClustering( ).fit(X)
HClabels = model.labels_
X['HClabels'] = HClabels


from matplotlib import pyplot as plt

# Plot
fig, ax = plt.subplots(1, 2, figsize=(10, 5), dpi = 150)
sns.scatterplot(x='salary', y='outflow', hue='company', data=df, ax=ax[0])
sns.scatterplot(x='salary', y='outflow', hue='HClabels', data=X, ax=ax[1])

<AxesSubplot:xlabel='salary', ylabel='outflow'>


# create linkage for agglomerative clustering, and the dendrogram for the linkage. Suggest the optimal number of clusters based on the dendrogram.
plt.figure(figsize=(10, 10))
Z = linkage(X, method='ward')
dend = dendrogram(Z)
plt.axhline(y=300, color='r', label='21')

<matplotlib.lines.Line2D at 0x7f98f0d5fd30>


# Generate plot structure
fig, ax = plt.subplots(2, 2, figsize=(10, 10), dpi = 150)

# True labels
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'company', data = df, ax = ax[0,0])
ax[0,0].set_title('True Labels')

# K-means
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'klabels', data = X, ax = ax[0,1])
ax[0,1].set_title('K-means')

# DBSCAN
sns.scatterplot(x = 'salary', y = 'outflow', hue = 'DBlabels', data = X, ax = ax[1,0])
ax[1,0].set_title('DBSCAN')

# Hierarchical
sns.scatterplot(x='salary', y='outflow', hue='HClabels', data= X, ax=ax[1,1])
ax[1,1].set_title('Hierarchical')

Text(0.5, 1.0, 'Hierarchical')

	company	seniority	job_category	gender	count	inflow	outflow	salary
1	Home Depot	4	0	0	6.649361	0.027259	0.020727	6.296236e+05
2	Home Depot	3	7	1	0.114460	0.000265	0.000297	8.108699e+03
3	Home Depot	5	6	1	0.005991	0.001000	0.000000	5.206698e+02
4	Home Depot	1	1	1	192.650237	10.110865	6.249455	1.124800e+07
5	Home Depot	3	1	0	84.062919	1.707721	1.253758	8.842880e+06

	seniority	job_category	gender	count	inflow	outflow	salary
1	4	0	0	-0.155382	-0.136760	-0.144712	-0.214896
2	3	7	1	-0.164201	-0.138127	-0.146028	-0.228569
3	5	6	1	-0.164347	-0.138089	-0.146047	-0.228736
4	1	1	1	0.095629	0.373655	0.256436	0.018696
5	3	1	0	-0.050911	-0.051698	-0.065302	-0.034214

Clustering on Workforce Diversity Outcomes Record Dataset¶

1. Introduction¶

2. Theory¶

Clustering Method 1: K-means¶

Clustering Method 2: DBSCAN¶

Clustering Method 3: Hierarchical Clustering¶

3. Methods¶

3.1 Data Selection¶

3.2 Feature Selection¶

3.3 Hyperparameter Tuning and Final Results for each Clustering Method¶

3.3.1 K-means Clustering¶

3.3.2 DBSCAN¶

3.3.3 Hierarchical Clustering¶

4. Results¶

5. Conclusions¶