Accounting Research

Factor Clustering Analysis of Financial Ratios

Notebook Configurations

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.decomposition import PCA,FactorAnalysis
from sklearn.cluster import KMeans
from sklearn import preprocessing, linear_model, decomposition
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

Data Preparation

# Load the US_GAAP_2011 data set and the factors lookup table (its 'label'
# and 'factor' columns are used further down).
data = pd.read_csv("data/US_GAAP_2011.csv")
factors = pd.read_csv("data/US_factors.csv")
# Rename the columns to the integers 1..50; this assignment only succeeds
# when the CSV has exactly 50 columns.
data.columns = list(range(1,51))
# Pairwise column correlations, plus a copy whose rows are sorted by label
# in descending order (used for the off-diagonal heatmaps).
corr = data.corr()
dec_corr = corr.sort_index(ascending=False)
data.head()

	1	2	3	4	5	6	7	8	9	10	...	41	42	43	44	45	46	47	48	49	50
0	42.129630	12.786144	285.578828	7.104444	1.694915	31.096376	0.690371	180.876494	3.065703	6.317104	...	56.277533	83.527454	4.418236	0.613356	2.567817	163.037461	32.963063	47.659531	19.164191	2.511409
1	24.587598	1.178338	568.911048	77.117896	8.667923	5.794678	0.569748	154.674448	13.407062	16.147590	...	124.490521	242.278902	5.072821	1.869247	1.076372	53.497464	23.229518	73.672075	69.513237	11.780029
2	23.392283	0.000000	2769.314642	307.476635	11.163264	14.465474	0.309314	27.811550	3.104677	-15.336764	...	-117.850638	-94.089609	10.003040	15.281675	1.050158	6.543785	22.089012	44.521249	-69.431644	-3.168224
3	18.273444	4.924854	508.487363	17.917767	2.600673	10.711415	0.649931	84.000000	2.184566	9.080457	...	131.706183	-2638.888889	6.922685	2.171528	1.163006	46.050520	33.603438	58.883910	27.022403	8.526992
4	39.768908	0.000000	2999.940770	2.645607	0.080056	61.602539	0.630420	91.044776	0.072887	6.115353	...	3924.590164	-1340.000000	9.222299	0.356556	2.999199	280.461243	17.428397	10.708899	35.088441	7.582222

5 rows × 50 columns

Data Exploration - Visualization - Correlations

# Block 1: first 25 rows x first 25 columns of the correlation matrix.
plt.subplots(figsize=(20, 20))
sns.heatmap(corr[:25][corr.columns[:25]], vmax=.9, square=True)
plt.show()

# Block 2: rows from position 25 onward x columns from position 25 onward.
plt.subplots(figsize=(20, 20))
sns.heatmap(corr[25:][corr.columns[25:]], vmax=.9, square=True)
plt.show()

# Block 3: taken from the descending-sorted matrix, first 26 rows x first 25
# columns; the y-axis is flipped after drawing.
# NOTE(review): 26 rows vs 25 columns -- confirm the extra row is intended.
plt.subplots(figsize=(20, 20))
ax = sns.heatmap(dec_corr[:26][dec_corr.columns[:25]], vmax=.9, square=True)
ax.invert_yaxis()
plt.show()

# Block 4: descending-sorted matrix, rows from position 24 onward x columns
# from position 25 onward; the y-axis is flipped after drawing.
# NOTE(review): the row slice starts one position before the column slice --
# confirm the overlap with block 3 is intended.
plt.subplots(figsize=(20, 20))
ax = sns.heatmap(dec_corr[24:][dec_corr.columns[25:]], vmax=.9, square=True)
ax.invert_yaxis()
plt.show()

Complete Correlation Matrix

# Heatmap of the full correlation matrix; the colour scale is capped at 0.9.
plt.subplots(figsize=(20, 20))
sns.heatmap(corr, vmax=.9, square=True)
plt.show()

Features Reduction - Kmeans Clustering

Transformations - Normalize, PCA Reduction

# Scale every row of the data to unit L1 norm, then project the result onto
# its first two principal components (used below for 2-D plotting and
# clustering).
# `.values` replaces DataFrame.as_matrix(), which was deprecated in
# pandas 0.23 and removed in pandas 1.0.
df = preprocessing.normalize(data.values, norm='l1')
reduced_2d = PCA(n_components=2).fit_transform(df)

Unsupervised Dimensionality Reduction

# NOTE(review): `logistic` and `pipe` are built here but not referenced again
# in the visible notebook -- confirm whether they are still needed.
logistic = linear_model.LogisticRegression()
pca = decomposition.PCA()
pipe = Pipeline(steps=[('pca', pca), ('logistic', logistic)])

# Fit a PCA with the default number of components on the normalized data and
# plot the explained variance of each component.
pca.fit(df)
plt.figure(figsize=(15, 15))
plt.clf()
plt.axes([.2, .2, .7, .7])
plt.plot(pca.explained_variance_, linewidth=2)
plt.axis('tight')
plt.xlabel('n_components')
plt.ylabel('explained_variance_')
plt.show()

def compute_scores(X,n_components):
    """Cross-validate PCA and FactorAnalysis over several component counts.

    For each value in ``n_components`` both estimators are set to that number
    of components and evaluated on ``X`` with ``cross_val_score`` (default
    settings); the per-fold scores are averaged.

    Returns a dict with the keys ``"pca_scores"`` and ``"fa_scores"``, each
    mapping a component count to its mean cross-validation score.
    """
    pca_model = PCA(svd_solver='full')
    fa_model = FactorAnalysis()
    pca_means = {}
    fa_means = {}

    for count in n_components:
        # Reconfigure the shared estimators instead of building new ones.
        pca_model.set_params(n_components=count)
        fa_model.set_params(n_components=count)
        pca_means[count] = cross_val_score(pca_model, X).mean()
        fa_means[count] = cross_val_score(fa_model, X).mean()

    return {"pca_scores": pca_means, "fa_scores": fa_means}

# Candidate component counts to compare: 10 through 14.
n_components = np.arange(10, 15)
# Tabulate the mean cross-validation scores: one row per component count,
# one column per model.
scores = pd.DataFrame(compute_scores(df, n_components))
scores

	fa_scores	pca_scores
10	182.612789	126.891099
11	183.098101	125.443178
12	186.432887	124.676660
13	180.815650	123.652364
14	183.647193	123.423738

Complete Correlation Matrix - Normalize Data

# Heatmap of the correlation matrix of the normalized data; `df` is wrapped
# in a DataFrame so that .corr() is available.
plt.subplots(figsize=(20, 20))
sns.heatmap(pd.DataFrame(df).corr(), vmax=.9, square=True)
plt.show()

# scree plot

K-Means Model: 10 - PCA Components: 2

# Fit a 10-cluster K-means model (k-means++ seeding, 10 initializations) on
# the 2-component PCA projection.
kmeans_2d = KMeans(init='k-means++', n_clusters=10, n_init=10)
kmeans_2d.fit(reduced_2d)

# Shape of the projected data.
reduced_2d.shape

# Shape of the fitted cluster-label array.
kmeans_2d.labels_.shape

Visualize PCA Reduced Data

# Scatter the two PCA components against each other as black dots.
plt.subplots(figsize=(20, 20))
plt.plot(reduced_2d[:, 0], reduced_2d[:, 1], 'k.', markersize=6)
plt.show()

K-means Clustering - Visualization

# Step size of the mesh grid.
h = .001

# Build a grid over the fixed square [-1, 1) x [-1, 1) and predict the
# cluster of every grid point to obtain the decision regions.
x_min, x_max = - 1, 1
y_min, y_max = - 1, 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = kmeans_2d.predict(np.c_[xx.ravel(), yy.ravel()])

# Reshape the flat predictions back to the grid so they can be drawn as an
# image.
Z = Z.reshape(xx.shape)

plt.subplots(figsize=(20, 20))
plt.clf()

# Draw the decision regions as a coloured background.
plt.imshow(Z, interpolation='nearest',
           extent=(xx.min(), xx.max(), yy.min(), yy.max()),
           cmap=plt.cm.Paired,
           aspect='auto', origin='lower')

# Overlay the PCA-reduced points as black dots.
plt.plot(reduced_2d[:, 0], reduced_2d[:, 1], 'k.', markersize=7)

# Plot the centroids as a white X, drawn on top of everything else.
centroids = kmeans_2d.cluster_centers_
plt.scatter(centroids[:, 0], centroids[:, 1],
            marker='x', s=169, linewidths=5,
            color='w', zorder=10)

plt.title('K-means clustering on PCA-reduced data and Centroids')
plt.xlim(x_min, x_max)
plt.ylim(y_min, y_max)
plt.show()

# Number of points assigned to each cluster.
Counter(kmeans_2d.labels_)

K-means Clustering - PCA - Cluster 10

Transformations - Normalize, PCA Reduction

# Shape of the normalized data matrix.
df.shape

# 11-component PCA on the normalized data as-is (one output row per data row).
reduced_data = PCA(n_components=11).fit_transform(df)
reduced_data.shape

# Same reduction on the transpose, so each output row corresponds to one
# original column; this overwrites the result above.
reduced_data = PCA(n_components=11).fit_transform(df.T)
reduced_data.shape
# The transposed variant is the one carried forward to the clustering cells.

# 12-component factor analysis on the transposed normalized data: one row of
# factor scores per original column.
data_fa = FactorAnalysis(n_components=12).fit_transform(df.T)
data_fa = pd.DataFrame(data_fa)
# Index the rows by the 'label' column of the factors table.
data_fa.index = factors['label']
data_fa.head()

	0	1	2	3	4	5	6	7	8	9	10	11
label
a_1	-0.143795	-0.099797	-0.135252	0.366406	-0.242712	-0.277372	6.970977	0.261439	0.053624	-0.004625	-0.000391	-0.023752
a_2	-0.143378	-0.281464	-0.142794	-0.386311	-0.327465	-0.323751	-0.159217	0.150312	-0.436785	0.123106	0.287120	-0.356258
a_3	-0.146273	6.809076	-1.117159	-1.037797	-0.229769	-0.411944	-0.039873	-0.009395	-0.070357	0.059029	-0.066813	-0.174677
a_4	-0.143725	-0.020199	-0.174889	-0.305684	-0.334707	-0.255149	-0.173129	0.168728	0.562148	-0.715145	1.534362	4.790746
a_5	-0.143406	-0.269479	-0.142701	-0.369602	-0.324011	-0.307698	-0.155482	0.132649	-0.322360	0.043791	0.431541	0.031675

# 11-component PCA on the transposed normalized data, wrapped in a DataFrame
# whose rows are indexed by the 'label' column of the factors table.
reduced_data = PCA(n_components=11).fit_transform(df.T)
reduced_data = pd.DataFrame(reduced_data)
reduced_data.index = factors['label']
reduced_data.head()

K-Means Model - Clusters 10

# Cluster the PCA-reduced rows into 10 groups and count the cluster sizes.
kmeans = KMeans(init='k-means++', n_clusters=10,n_init=10)
clusters = kmeans.fit_predict(reduced_data)
Counter(clusters)

# Cluster the factor-analysis scores into 12 groups; this rebinds `kmeans`
# and `clusters`, so later cells see only this second result.
# NOTE(review): no random_state is passed, so cluster ids and sizes may
# differ between runs -- confirm whether a fixed seed is wanted.
kmeans = KMeans(init='k-means++', n_clusters=12,n_init=10)
clusters = kmeans.fit_predict(data_fa)
Counter(clusters)

Counter({0: 31,
         1: 1,
         2: 1,
         3: 1,
         4: 1,
         5: 1,
         6: 9,
         7: 1,
         8: 1,
         9: 1,
         10: 1,
         11: 1})

Labeling Factors

# Store each row's cluster id in a new 'cluster' column of the factors table,
# then show the rows assigned to cluster 6.
factors["cluster"] = clusters.tolist()
factors[factors['cluster']==6]

	factor	label	cluster
3	Depreciation and amortization to total operati...	a_4	6
11	Gross profit margin	a_12	6
21	PPE to sales	a_22	6
26	Days sales outstanding (DSO)	a_27	6
27	Days payables outstanding	a_28	6
30	Equity ratio	a_31	6
31	Total liabilities to total capital (leverage)	a_32	6
33	Leverage structure	a_34	6
46	Current liabilities to sales	a_47	6

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

readme.md

readme.md

Accounting Research

Factor Clustering Analysis of Financial Ratios

Notebook Configurations

Data Preparation

Data Exploration - Visualization - Correlations

Complete Correlation Matrix

Features Reduction - Kmeans Clustering

Transformations - Normalize, PCA Reduction

Unsupervised Dimensionality Reduction

Complete Correlation Matrix - Normalize Data

K-Means Model: 10 - PCA Components: 2

Visualize PCA Reduced Data

K-means Clustering - Visualization

K-means Clustering - PCA - Cluster 10

Transformations - Normalize, PCA Reduction

K-Means Model - Clusters 10

Labeling Factors

Files

readme.md

Latest commit

History

readme.md

File metadata and controls

Accounting Research

Factor Clustering Analysis of Financial Ratios

Notebook Configurations

Data Preparation

Data Exploration - Visualization - Correlations

Complete Correlation Matrix

Features Reduction - Kmeans Clustering

Transformations - Normalize, PCA Reduction

Unsupervised Dimensionality Reduction

Complete Correlation Matrix - Normalize Data

K-Means Model: 10 - PCA Components: 2

Visualize PCA Reduced Data

K-means Clustering - Visualization

K-means Clustering - PCA - Cluster 10

Transformations - Normalize, PCA Reduction

K-Means Model - Clusters 10

Labeling Factors