##### Akeem Wells ( ajw3rg@virginia.edu )
##### DS 5001
##### 10 May 2021


import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from scipy.linalg import norm

import plotly_express as px
import seaborn as sns

sns.set(style='ticks')
%matplotlib inline


# Index column(s) that identify one document; here a document is one song.
OHCO = ["song_id"]


# Load each table from CSV, then index it by its key column.
TFIDF = pd.read_csv('TFIDF.csv')
TFIDF = TFIDF.set_index(OHCO)

LIB = pd.read_csv('LIB.csv')
LIB = LIB.set_index('song_id')

VOCAB = pd.read_csv('VOCAB2.csv')
VOCAB = VOCAB.set_index('term_id')


# Previews of the three tables (only rendered when run as notebook cells).
TFIDF.head()


LIB.head()


VOCAB.head()


# Normalize doc vector lengths: divide every row by its own vector norm.
def _to_unit_length(doc_vector):
    return doc_vector / norm(doc_vector)


TFIDF = TFIDF.apply(_to_unit_length, axis=1)


# Center the word vectors: subtract each column's mean from that column.
TFIDF = TFIDF - TFIDF.mean()


# Compute the covariance matrix between columns (terms).
# For already-centered data without missing values this matches the manual
# form TFIDF.T.dot(TFIDF) / (TFIDF.shape[0] - 1).
COV = TFIDF.cov()


COV.head()


# Heat-mapped preview of the top-left 5 x 10 corner (notebook display only).
COV.iloc[:5, :10].style.background_gradient()


# Decompose the covariance matrix.
# Earlier alternatives, kept for reference: `from numpy.linalg import eig`
# and `from scipy.linalg import eig`.
# NOTE: the name `eig` is bound to scipy's `eigh` (the solver for symmetric /
# Hermitian matrices), not to a general-purpose `eig` routine.
from scipy.linalg import eigh as eig


# Per the SciPy docs, `eigh` returns eigenvalues in ascending order and the
# eigenvector for eig_vals[i] is the COLUMN eig_vecs[:, i].
eig_vals, eig_vecs = eig(COV)


# Convert eigen data to dataframes.
# TERM_IDX holds COV's row labels; it is reused below to label both axes of
# the eigenvector table and the rows of the eigenvalue table.
TERM_IDX = COV.index


# Rows are labeled by TERM_IDX and each column holds one eigenvector. The
# column labels are also TERM_IDX, but there they only serve as positional
# keys for the components; they do not mean "this component is that term".
EIG_VEC = pd.DataFrame(eig_vecs, index=TERM_IDX, columns=TERM_IDX)


# One eigenvalue per row, keyed with the same labels as EIG_VEC's columns so
# the join below lines each eigenvalue up with its eigenvector.
EIG_VAL = pd.DataFrame(eig_vals, index=TERM_IDX, columns=['eig_val'])
# NOTE(review): EIG_VAL.index may be the very same Index object as TERM_IDX /
# COV.index, in which case this rename is visible on those too -- confirm
# before reordering or replacing this statement.
EIG_VAL.index.name = 'term_id'


# Combine eigenvalues and eigenvectors: transposing EIG_VEC puts one
# eigenvector per row, and the index join attaches its eigenvalue.
EIG_PAIRS = EIG_VAL.join(EIG_VEC.T)


# Compute explained variance: each eigenvalue as a percentage of the sum of
# all eigenvalues, rounded to two decimals.
EIG_PAIRS['exp_var'] = np.round((EIG_PAIRS.eig_val / EIG_PAIRS.eig_val.sum()) * 100, 2)


# Bar chart of the five largest explained-variance percentages.
EIG_PAIRS.exp_var.sort_values(ascending=False).head().plot.bar(rot=45)

<AxesSubplot:xlabel='term_id'>


# Pick Top K (10) Components.
# Rank on the raw eigenvalue rather than on `exp_var`: `exp_var` is rounded to
# two decimals, so nearby components can tie and come out in arbitrary order.
TOPS = EIG_PAIRS.sort_values('eig_val', ascending=False).head(10).reset_index(drop=True)
# Assign the PC labels first and the index name second; replacing the index
# after naming it would silently throw the name away.
TOPS.index = ["PC{}".format(i) for i in TOPS.index.tolist()]
TOPS.index.name = 'comp_id'


# Show Loadings: keep only the term columns of TOPS and transpose, giving one
# row per term and one column per principal component.
LOADINGS = TOPS[TERM_IDX].T
LOADINGS.index.name = 'term_id'


LOADINGS.head().style.background_gradient()


# Attach each term's string from VOCAB with a single label lookup. The index
# is cast to int before the lookup, as the original per-row `int(x.name)` did;
# a term id missing from VOCAB still raises KeyError.
LOADINGS['term_str'] = VOCAB.loc[LOADINGS.index.astype(int), 'term_str'].to_numpy()


def _extreme_terms(component, positive, k=10):
    """Return the k terms at one pole of `component`, joined by spaces.

    positive=True  -> the k largest (most positive) loadings, largest first.
    positive=False -> the k smallest (most negative) loadings, smallest first.
    """
    ranked = LOADINGS.sort_values(component, ascending=not positive)
    return ranked.head(k).term_str.str.cat(sep=' ')


# The "+" pole must hold the largest loadings and the "-" pole the smallest;
# previously the two sort directions were swapped relative to these names.
# (The overall sign of an eigenvector is arbitrary, so "+"/"-" only tell the
# two poles of a component apart.)
lb0_pos = _extreme_terms('PC0', positive=True)
lb0_neg = _extreme_terms('PC0', positive=False)
lb1_pos = _extreme_terms('PC1', positive=True)
lb1_neg = _extreme_terms('PC1', positive=False)


print('Books PC0+', lb0_pos)
print('Books PC0-', lb0_neg)
print('Books PC1+', lb1_pos)
print('Books PC1-', lb1_neg)

Books PC0+ youre were na love gonna heart tonight are ill oh
Books PC0- nigga she ayy bitch niggas her fuck shit he money
Books PC1+ she her love ill heart ever was one would were
Books PC1- na hey shake boyfriend woo uh da dance boom girlfriend

	1	2	3	4	5	6	7	8	9	10	...	12361	12362	12364	12366	12367	12373	12377	12378	12380	12381
song_id
1001	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1002	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1003	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1004	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1005	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	title	artist	year	song_file	genre
song_id
1001	justdance	ladygaga	2009	data/2009/pop/Just_Dance---lady_gaga_.txt	pop
1002	mylifewouldsuckwithoutyou	kellyclarkson	2009	data/2009/pop/My_Life_Would_Suck_Without_You--...	pop
1003	idonothookup	kellyclarkson	2009	data/2009/pop/I_Do_Not_Hook_Up---kelly_clarkso...	pop
1004	paparazzi	ladygaga	2009	data/2009/pop/Paparazzi---lady_gaga.txt	pop
1005	wakingupinvegas	katyperry	2009	data/2009/pop/Waking_Up_In_Vegas---katy_perry.txt	pop

	term_rank	term_str	n	num	stop	p_stem	pos_max	tfidf_mean	tfidf_sum	tfidf_median	tfidf_max
term_id
1	10407	01	1	1	0	01	CD	0.005017	0.005017	0.005017	0.005017
2	7594	06	1	1	0	06	CD	0.005017	0.005017	0.005017	0.005017
3	7531	082	1	1	0	082	CD	0.006174	0.006174	0.006174	0.006174
4	7523	092	1	1	0	092	CD	0.006515	0.006515	0.006515	0.006515
5	1330	1	15	1	0	1	CD	0.016533	0.132266	0.004439	0.088778

	1	2	3	4	5	6	7	8	9	10	...	12361	12362	12364	12366	12367	12373	12377	12378	12380	12381
1	8.562018e-06	8.562018e-06	-9.226316e-09	-1.029379e-08	-1.247235e-07	-7.736314e-08	-6.021163e-08	-3.354429e-08	-1.912432e-08	-8.767372e-09	...	-5.019753e-09	-1.003951e-08	-4.015803e-08	-5.019753e-09	-1.003951e-08	-5.019753e-09	-5.019753e-09	-5.019753e-09	-5.019753e-09	-1.003951e-08
2	8.562018e-06	8.562018e-06	-9.226316e-09	-1.029379e-08	-1.247235e-07	-7.736314e-08	-6.021163e-08	-3.354429e-08	-1.912432e-08	-8.767372e-09	...	-5.019753e-09	-1.003951e-08	-4.015803e-08	-5.019753e-09	-1.003951e-08	-5.019753e-09	-5.019753e-09	-5.019753e-09	-5.019753e-09	-1.003951e-08
3	-9.226316e-09	-9.226316e-09	7.699204e-06	-9.761354e-09	-1.182723e-07	-7.336162e-08	-5.709726e-08	-3.180925e-08	-1.813513e-08	-8.313890e-09	...	-4.760113e-09	-9.520225e-09	-3.808090e-08	-4.760113e-09	-9.520225e-09	-4.760113e-09	-4.760113e-09	-4.760113e-09	-4.760113e-09	-9.520225e-09
4	-1.029379e-08	-1.029379e-08	-9.761354e-09	9.583843e-06	-1.319563e-07	-8.184947e-08	-6.370334e-08	-3.548954e-08	-2.023335e-08	-9.275796e-09	...	-5.310852e-09	-1.062170e-08	-4.248681e-08	-5.310852e-09	-1.062170e-08	-5.310852e-09	-5.310852e-09	-5.310852e-09	-5.310852e-09	-1.062170e-08
5	-1.247235e-07	-1.247235e-07	-1.182723e-07	-1.319563e-07	4.905735e-04	3.539238e-06	-7.718541e-07	-4.300049e-07	-2.451550e-07	-1.123891e-07	...	-6.434832e-08	-1.286966e-07	-5.147865e-07	-6.434832e-08	-1.286966e-07	-6.434832e-08	-6.434832e-08	-6.434832e-08	-6.434832e-08	-1.286966e-07

	1	2	3	4	5	6	7	8	9	10
1	0.000009	0.000009	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000
2	0.000009	0.000009	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000
3	-0.000000	-0.000000	0.000008	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000
4	-0.000000	-0.000000	-0.000000	0.000010	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000	-0.000000
5	-0.000000	-0.000000	-0.000000	-0.000000	0.000491	0.000004	-0.000001	-0.000000	-0.000000	-0.000000

Preprocess the TFIDF Matrices¶

Normalize doc vector lengths¶

Normalize term vector variance¶

Center the word vectors¶

Compute Covariance Matrix¶

Decompose the Matrix¶

Convert eigen data to dataframes¶

Select Principal Components¶

Combine eigenvalues and eigenvectors¶

Compute and Show Explained Variance¶

Pick Top K (10) Components¶

Show Loadings¶

	PC0	PC1	PC2	PC3	PC4	PC5	PC6	PC7	PC8	PC9
term_id
1	0.001361	-0.000645	0.001173	0.001392	-0.000845	0.000249	-0.001201	-0.000194	-0.000417	0.000257
2	0.001361	-0.000645	0.001173	0.001392	-0.000845	0.000249	-0.001201	-0.000194	-0.000417	0.000257
3	0.000896	0.000091	-0.000101	0.001193	-0.000333	0.000890	-0.000367	0.000526	-0.000120	-0.000501
4	0.003123	0.000147	0.000070	-0.002357	0.001565	-0.000814	-0.000118	0.001415	0.001941	0.001476
5	0.001986	0.005860	-0.009990	0.005153	0.009717	-0.002846	0.001328	0.010538	-0.013890	-0.006790