import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
import plotly_express as px
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
def plot_tree(tree, labels):
    """Draw a left-oriented dendrogram for a hierarchical-clustering linkage.

    Args:
        tree: Linkage matrix, e.g. the result of
            ``scipy.cluster.hierarchy.linkage``.
        labels: Leaf labels passed through to ``dendrogram``; one per
            clustered observation.
    """
    # plt.subplots() creates its own figure, so no separate plt.figure() call
    # is made here (it would only leave an extra, empty figure behind).
    _, ax = plt.subplots(figsize=(5, 10))
    # Draw on the axes created above instead of relying on pyplot's implicit
    # "current axes" state.
    sch.dendrogram(tree, labels=labels, orientation="left", ax=ax)
    ax.tick_params(axis='both', which='major', labelsize=14)
# Vocabulary size cap handed to the vectorizer (max_features).
n_terms = 4_000

# Number of topics the LDA model is asked to fit (n_components).
n_topics = 30

# Iteration limit handed to the LDA model.
max_iter = 5

# Grouping keys that identify one document in the corpus.
OHCO = [
    'genre',
    'artist',
    'title',
]
%matplotlib inline
# Load the token table from the CSV file in the working directory.
TOKENS = pd.read_csv("TOKENS.csv")
# Preview the first rows (notebook display only).
TOKENS.head()
genre | artist | title | sent_num | token_num | pos_tuple | pos | token_str | term_str | term_id | |
---|---|---|---|---|---|---|---|---|---|---|
0 | pop | ladygaga | justdance | 0 | 0 | ('truth', 'NN') | NN | truth | truth | 11236 |
1 | pop | ladygaga | justdance | 1 | 0 | ('redone', 'NN') | NN | redone | redone | 8681 |
2 | pop | ladygaga | justdance | 2 | 0 | ('konvict', 'NN') | NN | konvict | konvict | 5937 |
3 | pop | ladygaga | justdance | 3 | 0 | ('gaga', 'NN') | NN | gaga | gaga | 4349 |
4 | pop | ladygaga | justdance | 3 | 1 | ('ohoh', 'NN') | NN | ohoh | ohoh | 7439 |
# The (token, tag) tuple column is not used below, so discard it.
TOKENS = TOKENS.drop(columns=['pos_tuple'])
# Force every term to a string so the join below only ever sees strings.
TOKENS['term_str'] = TOKENS['term_str'].map(str)

# Keep only tokens tagged NN or NNS, then collapse each
# (genre, artist, title) group into one space-separated string of terms.
PARAS = (
    TOKENS.loc[TOKENS['pos'].str.match(r'^NNS?$')]
    .groupby(OHCO)['term_str']
    .apply(' '.join)
    .to_frame()
    .rename(columns={'term_str': 'lyrics'})
)
PARAS
lyrics | |||
---|---|---|---|
genre | artist | title | |
country | billycurrington | peoplearecrazy | man bar beers dont cares politics blonde chick... |
blakeshelton | boysroundhere | boys round beatles bocephus jukebox needle hon... | |
godgavemeyou | walking heartache mess person aint watch storm... | ||
godscountry | outside church town theres gold dirt road lot ... | ||
honeybee | girl i stuff feelings rest crazy yeah dont itl... | ||
... | ... | ... | ... |
rnbhiphop | ynwmelly | murderonmymind | ayy im studio bro yeah yeah ill yeah im studio... |
suicidal | love suicidal love i heart i ecstasy i death l... | ||
yogotti | downinthedm | i girl post dm dm eyes man i dm dm i snapchat ... | |
rakeitup | drummers fool ah strip club anthem nigga money... | ||
zayhilfigerrr | jujuonthatbeattzanthem | dance baby dance yeah oh baby grandma im god g... |
877 rows × 1 columns
We use scikit-learn's `CountVectorizer` to convert our F1 corpus of paragraphs (here, one document per song, held in `PARAS`) into a document-term vector space of word counts.
# Count vectorizer capped at n_terms features, using scikit-learn's built-in
# English stop-word list.
tfv = CountVectorizer(max_features=n_terms, stop_words='english')
# Document-term count matrix: one row per entry of PARAS.lyrics.
tf = tfv.fit_transform(PARAS.lyrics)
# Vocabulary in matrix-column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2, so prefer get_feature_names_out() and
# fall back to the old method only on releases that predate it.
if hasattr(tfv, 'get_feature_names_out'):
    TERMS = tfv.get_feature_names_out().tolist()
else:
    TERMS = tfv.get_feature_names()
We run scikit-learn's `LatentDirichletAllocation` algorithm and extract the THETA (document-topic weights) and PHI (topic-term weights) tables.
# Topic model configured from the notebook-level settings; the fixed seed
# keeps runs repeatable.
lda = LDA(
    n_components=n_topics,
    max_iter=max_iter,
    learning_offset=50.,
    random_state=0,
)

# Fit on the count matrix and keep the per-document topic weights, indexed
# like PARAS and with the column axis named 'topic_id'.
THETA = (
    pd.DataFrame(lda.fit_transform(tf), index=PARAS.index)
    .rename_axis(columns='topic_id')
)

# Eyeball a random sample of 20 documents.
THETA.sample(20).style.background_gradient()
topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
genre | artist | title | ||||||||||||||||||||||||||||||
country | dan+shay | 10000hours | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.139836 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.514724 | 0.000855 | 0.322364 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 | 0.000855 |
rap | cardib | bodakyellow(moneymoves) | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.010420 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.987193 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 | 0.000085 |
country | timmcgraw | humbleandkind | 0.000641 | 0.981410 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 | 0.000641 |
rap | lilbaby | wepaid | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.427410 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.225164 | 0.062251 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.000254 | 0.145263 | 0.133550 | 0.000254 | 0.000254 |
country | jasonaldean | flyoverstates | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.000521 | 0.984896 | 0.000521 |
morganwallen | morethanmyhometown | 0.000392 | 0.000392 | 0.794102 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.194918 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | |
rap | schoolboyq | studio | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.199376 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.787097 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 | 0.000483 |
pop | theveronicas | untouched | 0.000654 | 0.981046 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 | 0.000654 |
rnbhiphop | mustard | ballin | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.059768 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.932454 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 | 0.000278 |
rap | nickiminaj | superbass | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.990705 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 | 0.000321 |
rnbhiphop | aliciakeys | un-thinkable(imready) | 0.001075 | 0.668696 | 0.109250 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.193021 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 | 0.001075 |
rap | postmalone | rockstar | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.112954 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.879811 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 | 0.000258 |
rnbhiphop | rihanna | bitchbetterhavemymoney | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.818155 | 0.000256 | 0.000256 | 0.174665 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 | 0.000256 |
country | tobykeith | redsolocup | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.984409 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 | 0.000538 |
rnbhiphop | brysontiller | exchange | 0.332111 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.653959 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 | 0.000498 |
pop | katyperry | theonethatgotaway | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.975833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 | 0.000833 |
fun. | weareyoung | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.978030 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | |
rnbhiphop | ne-yo | sheknows | 0.000476 | 0.398179 | 0.000476 | 0.000476 | 0.000476 | 0.588488 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 | 0.000476 |
country | bradpaisley | then | 0.001010 | 0.970707 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 | 0.001010 |
rap | meekmill | goingbad | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.990794 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 | 0.000317 |
# Topic-by-term weight table taken from the fitted model: one row per topic,
# one column per vocabulary term.
PHI = (
    pd.DataFrame(lda.components_, columns=TERMS)
    .rename_axis(index='topic_id', columns='term_str')
)

# Show the first few terms as rows, with topics across the columns.
PHI.transpose().head().style.background_gradient()
topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
term_str | ||||||||||||||||||||||||||||||
a1 | 0.033333 | 0.037346 | 0.033333 | 0.033333 | 0.033333 | 2.029320 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 1.033333 | 0.033333 | 0.033333 |
aa | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 8.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 1.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 |
aaaall | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 12.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 |
aah | 0.033333 | 0.033333 | 0.033333 | 1.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 3.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 2.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 |
ac | 0.033333 | 2.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 1.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 1.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 | 0.033333 |
# For every topic, list its ten highest-weighted terms.
# Result: one row per topic_id, columns 0..9 holding terms from the
# heaviest weight downwards.
TOPICS = (
    PHI.stack()
    .to_frame()
    .rename(columns={0: 'weight'})
    .groupby('topic_id')
    .apply(
        lambda x: x.weight.sort_values(ascending=False)
        .head(10)
        .reset_index()
        # Name the axis via columns=: passing it positionally
        # (drop('topic_id', 1)) is rejected by pandas 2.0 and later.
        .drop(columns='topic_id')
        .term_str
    )
)
TOPICS
term_str | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
---|---|---|---|---|---|---|---|---|---|---|
topic_id | ||||||||||
0 | yeah | im | night | woo | ayy | dont | bitch | cause | niggas | girl |
1 | im | youre | hey | way | cause | baby | dont | time | girl | yeah |
2 | cause | im | blah | time | right | em | love | shimmy | ya | yeah |
3 | im | yeah | cause | heart | eh | time | way | tonight | love | baby |
4 | ya | im | time | baby | gonna | problem | cause | girl | love | heart |
5 | im | yeah | bitch | money | ayy | nigga | shit | man | niggas | fuck |
6 | love | yeah | baby | song | change | mama | ooh | oh | mind | nobrainer |
7 | dance | im | hey | feeling | way | diva | cause | body | yeah | home |
8 | yeah | hitta | youre | im | oh | hittas | bitch | time | life | cause |
9 | time | im | heart | yeah | love | way | head | life | girl | cause |
10 | yeah | imma | love | taste | im | bitch | ooh | ayy | aint | thats |
11 | yeah | ooh | im | money | party | ayy | girl | man | woo | rack |
12 | girl | youre | girls | dont | cause | yeah | world | im | aint | baby |
13 | im | yeah | dont | girl | time | love | heart | baby | youre | cause |
14 | im | yeah | cause | moves | money | bank | bitch | aint | shit | dance |
15 | im | yeah | youre | cause | sugar | round | ima | way | tonight | dont |
16 | yeah | hands | baby | lets | night | tonight | cause | time | girl | everybody |
17 | nigga | im | mmm | time | cause | way | baby | thats | shit | niggas |
18 | im | yeah | mmm | shes | hate | stop | dont | bottoms | baby | girlfriend |
19 | im | work | ass | yeah | niggas | cup | ayy | cause | bum | aint |
20 | baby | yeah | dont | youre | cause | love | time | im | night | ooh |
21 | im | youre | cause | wait | beggin | night | guy | ohoh | dont | song |
22 | bitch | im | friends | man | house | mediocre | hey | shit | way | college |
23 | youre | lot | im | home | yeah | love | baby | night | way | thing |
24 | im | yeah | cause | ooh | money | youre | girls | somethin | bout | years |
25 | thunder | yeah | moment | baby | girl | ill | time | money | im | youre |
26 | im | dont | baby | yeah | ooh | dance | love | bitch | gucci | girl |
27 | yeah | im | da | bitch | money | ah | look | way | hey | uh |
28 | yeah | im | baby | sorry | music | dance | bag | hands | cause | harder |
29 | boom | bitch | chick | hol | beat | ayy | ooh | im | havana | love |
# Human-readable label per topic: the topic id followed by its top terms.
TOPICS['label'] = TOPICS.apply(
    lambda row: f"{row.name} {' '.join(row)}",
    axis=1,
)
# Total weight each topic receives across all documents.
TOPICS['doc_weight_sum'] = THETA.sum()
# Horizontal bar chart of the topics, ordered by total document weight.
(
    TOPICS
    .sort_values(by='doc_weight_sum', ascending=True)
    .plot.barh(x='label', y='doc_weight_sum', figsize=(5, 10))
)
<AxesSubplot:ylabel='label'>
# Preview the first rows of the document-topic table (notebook display only).
THETA.head()
topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | ||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
genre | artist | title | |||||||||||||||||||||
country | billycurrington | peoplearecrazy | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.978030 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | ... | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 | 0.000758 |
blakeshelton | boysroundhere | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | ... | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | 0.000392 | |
godgavemeyou | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.969792 | 0.001042 | ... | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | 0.001042 | ||
godscountry | 0.000469 | 0.000469 | 0.000469 | 0.576191 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.232773 | ... | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | 0.000469 | ||
honeybee | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | ... | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 | 0.000901 |
5 rows × 30 columns
# Column labels of THETA: one integer per topic.
topic_cols = list(range(n_topics))

# Mean topic weight per genre, transposed so topics are rows and genres
# are columns.
GENRES = (
    THETA.groupby('genre')[topic_cols]
    .mean()
    .transpose()
)
GENRES.index.name = 'topic_id'

# Display with genres as rows.
GENRES.transpose()
topic_id | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
genre | |||||||||||||||||||||
country | 0.011465 | 0.143131 | 0.047043 | 0.059445 | 0.050820 | 0.000782 | 0.010506 | 0.007746 | 0.024339 | 0.077010 | ... | 0.079411 | 0.030221 | 0.018481 | 0.051493 | 0.030072 | 0.001040 | 0.010553 | 0.027269 | 0.032091 | 0.000782 |
pop | 0.018056 | 0.103750 | 0.024902 | 0.083782 | 0.039447 | 0.006277 | 0.020253 | 0.023194 | 0.028874 | 0.051602 | ... | 0.115541 | 0.028485 | 0.001994 | 0.063476 | 0.042379 | 0.008874 | 0.020815 | 0.025693 | 0.012347 | 0.009921 |
rap | 0.078607 | 0.061396 | 0.000411 | 0.048413 | 0.024895 | 0.195870 | 0.006101 | 0.013310 | 0.017888 | 0.036419 | ... | 0.050020 | 0.014382 | 0.024010 | 0.022399 | 0.005077 | 0.016396 | 0.036840 | 0.044016 | 0.022795 | 0.012059 |
rnbhiphop | 0.030709 | 0.105652 | 0.016000 | 0.055555 | 0.028383 | 0.093453 | 0.005096 | 0.014050 | 0.033677 | 0.052963 | ... | 0.072184 | 0.013938 | 0.006859 | 0.030758 | 0.019932 | 0.007247 | 0.042577 | 0.048929 | 0.026509 | 0.020197 |
4 rows × 30 columns
# Attach each topic's ten top terms (columns 0..9 of TOPICS) as one
# space-separated string.
GENRES['topterms'] = TOPICS[list(range(10))].apply(' '.join, axis=1)

# Five topics with the highest mean weight in country.
(
    GENRES
    .sort_values(by='country', ascending=False)
    .head(5)
    .style.background_gradient()
)
genre | country | pop | rap | rnbhiphop | topterms |
---|---|---|---|---|---|
topic_id | |||||
1 | 0.143131 | 0.103750 | 0.061396 | 0.105652 | im youre hey way cause baby dont time girl yeah |
20 | 0.079411 | 0.115541 | 0.050020 | 0.072184 | baby yeah dont youre cause love time im night ooh |
9 | 0.077010 | 0.051602 | 0.036419 | 0.052963 | time im heart yeah love way head life girl cause |
13 | 0.076397 | 0.078158 | 0.036833 | 0.071086 | im yeah dont girl time love heart baby youre cause |
12 | 0.069020 | 0.038786 | 0.010123 | 0.024358 | girl youre girls dont cause yeah world im aint baby |
# Five topics with the highest mean weight in pop.
(
    GENRES
    .sort_values(by='pop', ascending=False)
    .head(5)
    .style.background_gradient()
)
genre | country | pop | rap | rnbhiphop | topterms |
---|---|---|---|---|---|
topic_id | |||||
20 | 0.079411 | 0.115541 | 0.050020 | 0.072184 | baby yeah dont youre cause love time im night ooh |
1 | 0.143131 | 0.103750 | 0.061396 | 0.105652 | im youre hey way cause baby dont time girl yeah |
3 | 0.059445 | 0.083782 | 0.048413 | 0.055555 | im yeah cause heart eh time way tonight love baby |
13 | 0.076397 | 0.078158 | 0.036833 | 0.071086 | im yeah dont girl time love heart baby youre cause |
23 | 0.051493 | 0.063476 | 0.022399 | 0.030758 | youre lot im home yeah love baby night way thing |
# Five topics with the highest mean weight in rap.
(
    GENRES
    .sort_values(by='rap', ascending=False)
    .head(5)
    .style.background_gradient()
)
genre | country | pop | rap | rnbhiphop | topterms |
---|---|---|---|---|---|
topic_id | |||||
5 | 0.000782 | 0.006277 | 0.195870 | 0.093453 | im yeah bitch money ayy nigga shit man niggas fuck |
0 | 0.011465 | 0.018056 | 0.078607 | 0.030709 | yeah im night woo ayy dont bitch cause niggas girl |
1 | 0.143131 | 0.103750 | 0.061396 | 0.105652 | im youre hey way cause baby dont time girl yeah |
10 | 0.006093 | 0.017072 | 0.054728 | 0.026031 | yeah imma love taste im bitch ooh ayy aint thats |
20 | 0.079411 | 0.115541 | 0.050020 | 0.072184 | baby yeah dont youre cause love time im night ooh |
# Five topics with the highest mean weight in rnbhiphop.
(
    GENRES
    .sort_values(by='rnbhiphop', ascending=False)
    .head(5)
    .style.background_gradient()
)
genre | country | pop | rap | rnbhiphop | topterms |
---|---|---|---|---|---|
topic_id | |||||
1 | 0.143131 | 0.103750 | 0.061396 | 0.105652 | im youre hey way cause baby dont time girl yeah |
5 | 0.000782 | 0.006277 | 0.195870 | 0.093453 | im yeah bitch money ayy nigga shit man niggas fuck |
20 | 0.079411 | 0.115541 | 0.050020 | 0.072184 | baby yeah dont youre cause love time im night ooh |
13 | 0.076397 | 0.078158 | 0.036833 | 0.071086 | im yeah dont girl time love heart baby youre cause |
3 | 0.059445 | 0.083782 | 0.048413 | 0.055555 | im yeah cause heart eh time way tonight love baby |
# Pairwise genre comparisons: each topic is drawn as its topic_id at the
# point given by its mean weight in the two genres; hovering shows its
# top terms.

# rnbhiphop vs. country
(
    px.scatter(GENRES.reset_index(), x='rnbhiphop', y='country',
               hover_name='topterms', text='topic_id')
    .update_traces(mode='text')
)
# rnbhiphop vs. pop
(
    px.scatter(GENRES.reset_index(), x='rnbhiphop', y='pop',
               hover_name='topterms', text='topic_id')
    .update_traces(mode='text')
)
# rnbhiphop vs. rap
(
    px.scatter(GENRES.reset_index(), x='rnbhiphop', y='rap',
               hover_name='topterms', text='topic_id')
    .update_traces(mode='text')
)
# country vs. pop
(
    px.scatter(GENRES.reset_index(), x='country', y='pop',
               hover_name='topterms', text='topic_id')
    .update_traces(mode='text')
)
# country vs. rap
(
    px.scatter(GENRES.reset_index(), x='country', y='rap',
               hover_name='topterms', text='topic_id')
    .update_traces(mode='text')
)
# pop vs. rap
(
    px.scatter(GENRES.reset_index(), x='pop', y='rap',
               hover_name='topterms', text='topic_id')
    .update_traces(mode='text')
)
# Pairwise Euclidean distances between the normalized topic rows of PHI
# (despite the name, SIMS holds distances, in condensed form).
SIMS = pdist(normalize(PHI), metric='euclidean')
# Agglomerative clustering of the topics with Ward linkage.
TREE = sch.linkage(SIMS, method='ward')
# Leaf labels of the form "<topic_id>: <top terms>".
labels = [
    f"{topic}: {terms}"
    for topic, terms in GENRES.topterms.items()
]
plot_tree(TREE, labels)
<Figure size 432x288 with 0 Axes>