##### Akeem Wells ( ajw3rg@virginia.edu )
##### DS 5001
##### 10 May 2021
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from scipy.linalg import norm
import plotly_express as px
import seaborn as sns
# Apply seaborn's 'ticks' style to subsequent plots.
sns.set(style='ticks')
# IPython magic (notebook only): render matplotlib figures inline.
%matplotlib inline
# Index key of the song-level tables.
OHCO = ["song_id"]
# Read each CSV and index it by its key column, in the order TFIDF, LIB, VOCAB.
TFIDF, LIB, VOCAB = (
    pd.read_csv(csv_path).set_index(key)
    for csv_path, key in (
        ('TFIDF.csv', OHCO),
        ('LIB.csv', 'song_id'),
        ('VOCAB2.csv', 'term_id'),
    )
)
# Preview the first five rows of the song-by-term TF-IDF matrix.
TFIDF.head(n=5)
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 12361 | 12362 | 12364 | 12366 | 12367 | 12373 | 12377 | 12378 | 12380 | 12381 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
song_id | |||||||||||||||||||||
1001 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1002 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1003 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1004 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
1005 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
5 rows × 12337 columns
# Preview the first five rows of the song metadata table.
LIB.head(n=5)
title | artist | year | song_file | genre | |
---|---|---|---|---|---|
song_id | |||||
1001 | justdance | ladygaga | 2009 | data/2009/pop/Just_Dance---lady_gaga_.txt | pop |
1002 | mylifewouldsuckwithoutyou | kellyclarkson | 2009 | data/2009/pop/My_Life_Would_Suck_Without_You--... | pop |
1003 | idonothookup | kellyclarkson | 2009 | data/2009/pop/I_Do_Not_Hook_Up---kelly_clarkso... | pop |
1004 | paparazzi | ladygaga | 2009 | data/2009/pop/Paparazzi---lady_gaga.txt | pop |
1005 | wakingupinvegas | katyperry | 2009 | data/2009/pop/Waking_Up_In_Vegas---katy_perry.txt | pop |
# Preview the first five rows of the vocabulary table.
VOCAB.head(n=5)
term_rank | term_str | n | num | stop | p_stem | pos_max | tfidf_mean | tfidf_sum | tfidf_median | tfidf_max | |
---|---|---|---|---|---|---|---|---|---|---|---|
term_id | |||||||||||
1 | 10407 | 01 | 1 | 1 | 0 | 01 | CD | 0.005017 | 0.005017 | 0.005017 | 0.005017 |
2 | 7594 | 06 | 1 | 1 | 0 | 06 | CD | 0.005017 | 0.005017 | 0.005017 | 0.005017 |
3 | 7531 | 082 | 1 | 1 | 0 | 082 | CD | 0.006174 | 0.006174 | 0.006174 | 0.006174 |
4 | 7523 | 092 | 1 | 1 | 0 | 092 | CD | 0.006515 | 0.006515 | 0.006515 | 0.006515 |
5 | 1330 | 1 | 15 | 1 | 0 | 1 | CD | 0.016533 | 0.132266 | 0.004439 | 0.088778 |
# Scale every row (one song per row) to unit Euclidean (L2) length.
# The row norms are computed in one vectorized call rather than through a
# Python-level lambda applied row by row.
# NOTE(review): a row whose norm is 0 (all-zero TF-IDF) becomes NaN, exactly as
# it did with the row-wise apply -- confirm that no such rows exist.
TFIDF = TFIDF.div(norm(TFIDF.to_numpy(), axis=1), axis=0)
We do not normalize variance, which we would normally do, such as with data containing divergent units of measure. \ This is because to do so would exaggerate the importance of rare words (see Ng, 2008: 6m40s — 8m00s).
Note that we are taking the column-wise means -- the means for the term vectors. \ We don't really need to do this. But it is typical for PCA. \ NOTE: Some argue that centering alters the cosine angles.
# Center every term column on zero by subtracting that column's mean.
term_means = TFIDF.mean(axis=0)
TFIDF = TFIDF.sub(term_means, axis='columns')
We could compute this directly, but we use the built-in Pandas method here.
# Manual equivalents, kept for reference only. The first form relies on TFIDF
# already being mean-centered by the preceding cell.
# NOTE(review): TFIDF_b is not defined anywhere in the visible code -- confirm
# whether the second line is a leftover from another notebook.
# COV = TFIDF.T.dot(TFIDF) / (TFIDF.shape[0] - 1)
# COV_b = TFIDF_b.T.dot(TFIDF_b) / (TFIDF.shape[0] - 1)
# Covariance between every pair of TFIDF columns (terms); rows and columns of
# COV are both labelled with the TFIDF column labels.
COV = TFIDF.cov()
# Preview the first five rows.
COV.head()
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | ... | 12361 | 12362 | 12364 | 12366 | 12367 | 12373 | 12377 | 12378 | 12380 | 12381 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 8.562018e-06 | 8.562018e-06 | -9.226316e-09 | -1.029379e-08 | -1.247235e-07 | -7.736314e-08 | -6.021163e-08 | -3.354429e-08 | -1.912432e-08 | -8.767372e-09 | ... | -5.019753e-09 | -1.003951e-08 | -4.015803e-08 | -5.019753e-09 | -1.003951e-08 | -5.019753e-09 | -5.019753e-09 | -5.019753e-09 | -5.019753e-09 | -1.003951e-08 |
2 | 8.562018e-06 | 8.562018e-06 | -9.226316e-09 | -1.029379e-08 | -1.247235e-07 | -7.736314e-08 | -6.021163e-08 | -3.354429e-08 | -1.912432e-08 | -8.767372e-09 | ... | -5.019753e-09 | -1.003951e-08 | -4.015803e-08 | -5.019753e-09 | -1.003951e-08 | -5.019753e-09 | -5.019753e-09 | -5.019753e-09 | -5.019753e-09 | -1.003951e-08 |
3 | -9.226316e-09 | -9.226316e-09 | 7.699204e-06 | -9.761354e-09 | -1.182723e-07 | -7.336162e-08 | -5.709726e-08 | -3.180925e-08 | -1.813513e-08 | -8.313890e-09 | ... | -4.760113e-09 | -9.520225e-09 | -3.808090e-08 | -4.760113e-09 | -9.520225e-09 | -4.760113e-09 | -4.760113e-09 | -4.760113e-09 | -4.760113e-09 | -9.520225e-09 |
4 | -1.029379e-08 | -1.029379e-08 | -9.761354e-09 | 9.583843e-06 | -1.319563e-07 | -8.184947e-08 | -6.370334e-08 | -3.548954e-08 | -2.023335e-08 | -9.275796e-09 | ... | -5.310852e-09 | -1.062170e-08 | -4.248681e-08 | -5.310852e-09 | -1.062170e-08 | -5.310852e-09 | -5.310852e-09 | -5.310852e-09 | -5.310852e-09 | -1.062170e-08 |
5 | -1.247235e-07 | -1.247235e-07 | -1.182723e-07 | -1.319563e-07 | 4.905735e-04 | 3.539238e-06 | -7.718541e-07 | -4.300049e-07 | -2.451550e-07 | -1.123891e-07 | ... | -6.434832e-08 | -1.286966e-07 | -5.147865e-07 | -6.434832e-08 | -1.286966e-07 | -6.434832e-08 | -6.434832e-08 | -6.434832e-08 | -6.434832e-08 | -1.286966e-07 |
5 rows × 12337 columns
# Shade the top-left 5 x 10 corner of the covariance matrix with a colour gradient.
cov_corner = COV.iloc[:5, :10]
cov_corner.style.background_gradient()
1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | |
---|---|---|---|---|---|---|---|---|---|---|
1 | 0.000009 | 0.000009 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 |
2 | 0.000009 | 0.000009 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 |
3 | -0.000000 | -0.000000 | 0.000008 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 |
4 | -0.000000 | -0.000000 | -0.000000 | 0.000010 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | -0.000000 |
5 | -0.000000 | -0.000000 | -0.000000 | -0.000000 | 0.000491 | 0.000004 | -0.000001 | -0.000000 | -0.000000 | -0.000000 |
There are at least three options to choose from. We go with SciPy's Hermitian eigendecomposition method `eigh()`, since our covariance matrix is symmetric.
# Alternative general-purpose solvers, left here for reference:
# from numpy.linalg import eig
# from scipy.linalg import eig
# `eigh` is imported under the name `eig`, so the call below runs the
# symmetric/Hermitian solver even though it reads as `eig`.
from scipy.linalg import eigh as eig
# Per the SciPy `eigh` documentation, eigenvalues come back in ascending order
# and column i of eig_vecs is the eigenvector paired with eig_vals[i].
eig_vals, eig_vecs = eig(COV)
TERM_IDX = COV.index # Term labels of COV (the same labels as the TFIDF columns)
# TERM_IDX
# Rows of EIG_VEC are terms and columns are eigenvectors. The columns reuse the
# term labels only so that EIG_VEC.T can be joined to EIG_VAL by label below;
# those column labels do not identify terms.
EIG_VEC = pd.DataFrame(eig_vecs, index=TERM_IDX, columns=TERM_IDX)
# One eigenvalue per row, carrying the same reused labels.
EIG_VAL = pd.DataFrame(eig_vals, index=TERM_IDX, columns=['eig_val'])
EIG_VAL.index.name = 'term_id'
Next, we associate each eigenvalue with its corresponding column in the eigenvector matrix. \
This is why we transpose the `EIG_VEC` dataframe.
# Put each eigenvector on its own row so it lines up, by label, with the row
# of EIG_VAL that holds its eigenvalue.
eigenvectors_as_rows = EIG_VEC.transpose()
EIG_PAIRS = EIG_VAL.join(eigenvectors_as_rows, how='left')
Next, we sort in descending order and pick the top K (=10).
We might have used this value to sort our components.
# Explained variance: each eigenvalue as a percentage of the sum of all
# eigenvalues, rounded to two decimals.
total_variance = EIG_PAIRS['eig_val'].sum()
EIG_PAIRS['exp_var'] = (EIG_PAIRS['eig_val'] / total_variance * 100).round(2)
# Bar chart of the five largest explained-variance values.
top_shares = EIG_PAIRS['exp_var'].sort_values(ascending=False).head(5)
top_shares.plot(kind='bar', rot=45)
<AxesSubplot:xlabel='term_id'>
We pick these based on explained variance.
# Keep the 10 components with the largest explained variance. `exp_var` is
# rounded to two decimals, so the unrounded eigenvalue is used as a tie-breaker
# to keep the ranking deterministic when two components round to the same value.
TOPS = (
    EIG_PAIRS
    .sort_values(['exp_var', 'eig_val'], ascending=False)
    .head(10)
    .reset_index(drop=True)
)
# Label the components PC0..PC9 and name the index in the same step: assigning
# a plain list to `.index` after setting `.index.name` would discard the name.
TOPS.index = pd.Index(["PC{}".format(i) for i in TOPS.index], name='comp_id')
Loadings show the contribution of each term to the component. \ We'll just look at the top 10 words for the first two components in the Book version.
# Loadings table: select the eigenvector columns of TOPS and flip them so that
# each row is a term and each column is one retained component.
component_rows = TOPS[TERM_IDX]
LOADINGS = component_rows.transpose().rename_axis('term_id')
# Preview the first five terms with a colour gradient.
LOADINGS.head(n=5).style.background_gradient()
PC0 | PC1 | PC2 | PC3 | PC4 | PC5 | PC6 | PC7 | PC8 | PC9 | |
---|---|---|---|---|---|---|---|---|---|---|
term_id | ||||||||||
1 | 0.001361 | -0.000645 | 0.001173 | 0.001392 | -0.000845 | 0.000249 | -0.001201 | -0.000194 | -0.000417 | 0.000257 |
2 | 0.001361 | -0.000645 | 0.001173 | 0.001392 | -0.000845 | 0.000249 | -0.001201 | -0.000194 | -0.000417 | 0.000257 |
3 | 0.000896 | 0.000091 | -0.000101 | 0.001193 | -0.000333 | 0.000890 | -0.000367 | 0.000526 | -0.000120 | -0.000501 |
4 | 0.003123 | 0.000147 | 0.000070 | -0.002357 | 0.001565 | -0.000814 | -0.000118 | 0.001415 | 0.001941 | 0.001476 |
5 | 0.001986 | 0.005860 | -0.009990 | 0.005153 | 0.009717 | -0.002846 | 0.001328 | 0.010538 | -0.013890 | -0.006790 |
# Attach each term's string in one vectorized lookup. The LOADINGS row labels
# are cast to int to match VOCAB's `term_id` index; a label missing from VOCAB
# raises KeyError.
term_ids = LOADINGS.index.astype(int)
LOADINGS['term_str'] = VOCAB.loc[term_ids, 'term_str'].to_numpy()

# Ten strongest terms at each pole of the first two components.
# ascending=False lists the largest (most positive) loadings first, so that is
# the "+" pole; ascending=True lists the most negative loadings first ("-").
lb0_pos = LOADINGS.sort_values('PC0', ascending=False).head(10).term_str.str.cat(sep=' ')
lb0_neg = LOADINGS.sort_values('PC0', ascending=True).head(10).term_str.str.cat(sep=' ')
lb1_pos = LOADINGS.sort_values('PC1', ascending=False).head(10).term_str.str.cat(sep=' ')
lb1_neg = LOADINGS.sort_values('PC1', ascending=True).head(10).term_str.str.cat(sep=' ')
print('Books PC0+', lb0_pos)
print('Books PC0-', lb0_neg)
print('Books PC1+', lb1_pos)
print('Books PC1-', lb1_neg)
Books PC0+ youre were na love gonna heart tonight are ill oh Books PC0- nigga she ayy bitch niggas her fuck shit he money Books PC1+ she her love ill heart ever was one would were Books PC1- na hey shake boyfriend woo uh da dance boom girlfriend