# imports 
import pandas as pd 
import numpy as np 
import ast 
import os 
from collections import Counter 

# for text analysis 
import nltk
from nltk.corpus import stopwords 
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('stopwords', quiet=True)
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
from sklearn.feature_extraction.text import TfidfVectorizer

# for image analysis 
from PIL import Image
import cv2

# for visualizations 
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
pio.renderers.default = 'notebook'
import matplotlib.pyplot as plt

# sanity check line 
print("All libraries loaded successfully")

All libraries loaded successfully

# load the data 
train_df = pd.read_csv('../data/text_with_ocr.csv')
val_df = pd.read_csv('../data/validation_data.csv')

# quick look at the data 
print(f"Training rows: {len(train_df)}")
print(f"Validation rows: {len(val_df)}")
print(f"Columns: {list(train_df.columns)}")
train_df.head(3)

Training rows: 5552
Validation rows: 650
Columns: ['OCR', 'image', 'hero', 'villain', 'victim', 'other']

# what do the entity tags look like 
    # note that tags are stored as string representations of Python lists, e.g. "['donald trump']"
# so they must be parsed properly 
def parse_tag(val):
    """ Safely parse a tag column from string-list to actual list"""
    try: 
        result = ast.literal_eval(val)
        if isinstance(result, list):
            return result 
        return []
    except: 
        return []
    
# apply the parser to all entity columns 
for col in ['hero', 'villain', 'victim', 'other']:
    train_df[col + '_parsed'] = train_df[col].apply(parse_tag)
    val_df[col + '_parsed'] = val_df[col].apply(parse_tag)

# add a category column based on the image filename prefix 
    # i.e., images named 'covid_memes_X.png' are COVID; 'memes_X.png' are US politics
train_df['category'] = train_df['image'].apply(
    lambda x: 'COVID' if str(x).startswith('covid') else 'US Politics'
)
val_df['category'] = val_df['image'].apply(
    lambda x: 'COVID' if str(x).startswith('covid') else 'US Politics'
)

print("Category breakdown (train):") 
print(train_df['category'].value_counts())
print()
print("Sample row OCR text:")
print(train_df['OCR'].iloc[0])

Category breakdown (train):
category
US Politics    2852
COVID          2700
Name: count, dtype: int64

Sample row OCR text:
Bernie or Elizabeth? Be informed. Compare them on the issues that matter. Issue: Who makes the dankest memes?

# clean OCR text 
    # OCR from memes is inherently noisy 
        # (e.g., picks up watermarks, usernames, website URLs, and formatting artifacts) 
# Here: do light cleaning (but careful not to string too much because the meme language is intentional)

import re 
stop_words = set(stopwords.words('english'))

def clean_ocr(text):
    """
    Light cleaning for meme OCR text: 
    - lowercase 
    - remove URLs and handles 
    - remove special characters but keep spaces 
    - strip extra whitespace 
    """
    if not isinstance(text, str):
        return ''
    # lowercase
    text = text.lower()    
    # remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text) 
    # remove @handles
    text = re.sub(r'@\w+', '', text) 
    # keep only letters + spaces 
    text = re.sub(r'[^a-z\s]', ' ', text) 
    # collapse whitespace 
    text = re.sub(r'\s+', ' ', text).strip() 

    return text 

train_df['ocr_clean'] = train_df['OCR'].apply(clean_ocr)
val_df['ocr_clean'] = val_df['OCR'].apply(clean_ocr)

# also compute word count on cleaned text (useful for EDA)
train_df['word_count'] = train_df['ocr_clean'].apply(lambda x: len(x.split()))

print("Preprocessing complete.")
print(f"Average word count per meme: {train_df['word_count'].mean():.1f}")
print(f"Median word count: {train_df['word_count'].median():.1f}")

Preprocessing complete.
Average word count per meme: 19.1
Median word count: 16.0

# flatten all entity tags and count by category 
    # for each role (villain, hero, victim), want to know:
        # which named entities appear most frequently
        # and does that differ between COVID memes and US Politics memes? 

def get_entity_counts(df, role, category=None):
    """ 
    Extract and count entities for a given role and optional category filter
    """
    col = role + '_parsed'
    if category: 
        df = df[df['category'] == category]
    all_entities = []
    for tags in df[col]:
        all_entities.extend(tags)
    return Counter(all_entities)

# get top villains, heroes, and victims for each category 
roles = ['villain', 'hero', 'victim']
categories = ['US Politics', 'COVID']

for role in roles: 
    print(f"\n=== Top 10 {role.upper()}S ===")
    for cat in categories: 
        counts = get_entity_counts(train_df, role, cat)
        print(f"\t {cat}: {counts.most_common(10)}")

=== Top 10 VILLAINS ===
	 US Politics: [('donald trump', 358), ('joe biden', 149), ('democratic party', 145), ('republican party', 136), ('barack obama', 74), ('democrats', 55), ('hiliary clinton', 44), ('republicans', 39), ('libertarian party', 33), ('hillary clinton', 28)]
	 COVID: [('donald trump', 146), ('china', 62), ('coronavirus', 60), ('2020', 31), ('covid19', 17), ('joe biden', 13), ('covid infected people', 13), ('barack obama', 11), ('chinese', 11), ('government', 10)]

=== Top 10 HEROS ===
	 US Politics: [('donald trump', 37), ('barack obama', 32), ('green party', 19), ('libertarian party', 17), ('joe biden', 12), ('bernie sanders', 12), ('jill stein', 11), ('libertarian', 7), ('gary johnson', 5), ('republican party', 5)]
	 COVID: [('chuck norris', 11), ('corona beer', 11), ('weed', 8), ('dr. anthony fauci', 6), ('joe biden', 5), ('donald trump', 5), ('china', 4), ('russia', 4), ('alcohol', 4), ('barack obama', 3)]

=== Top 10 VICTIMS ===
	 US Politics: [('donald trump', 37), ('america', 30), ('barack obama', 24), ('democratic party', 21), ('women', 15), ('people', 13), ('americans', 12), ('joe biden', 12), ('mexicans', 11), ('american people', 10)]
	 COVID: [('donald trump', 22), ('people', 20), ('china', 14), ('coronavirus', 12), ('parents', 8), ('world', 6), ('the world', 6), ('usa', 6), ('america', 6), ('americans', 6)]

# Visualization: top villains by category (side-by-side bar chart)
    # here focus on villains because they have the most tag coverage (1,884 rows) and are the most politically revealing 

fig = make_subplots(rows=1, cols=2,
                    subplot_titles=('US Politics Memes: Top Villains',
                                    'COVID Memes: Top Villains'))

for i, cat in enumerate(['US Politics', 'COVID'], 1):
    counts = get_entity_counts(train_df, 'villain', cat)
    top = counts.most_common(10)
    entities = [x[0] for x in top][::-1]    # reverse so highest at the top
    values = [x[1] for x in top][::-1]

    color = '#e74c3c' if cat == 'US Politics' else '#3498db'

    fig.add_trace(
        go.Bar(y=entities, x=values, orientation='h',
               marker_color=color, name=cat),
        row=1, col=i
    )

fig.update_layout(
    title='Who Gets Blamed? Top Villain Entities by Meme Category',
    height=420,
    showlegend=False,
    font=dict(size=11)
)
fig.show()

# Hero comparison 
fig2 = make_subplots(rows=1, cols=2,
                     subplot_titles=('US Politics Memes: Top Heroes',
                                     'COVID Memes: Top Heroes'))

for i, cat in enumerate(['US Politics', 'COVID'], 1):
    counts = get_entity_counts(train_df, 'hero', cat)
    top = counts.most_common(10)
    if not top:
        continue 
    entities = [x[0] for x in top][::-1]
    values = [x[1] for x in top][::-1]

    color = '#27ae60' if cat == 'US Politics' else '#16a085'

    fig2.add_trace(
        go.Bar(y=entities, x=values, orientation='h',
               marker_color=color, name=cat),
        row=1, col=i
    )

fig2.update_layout(
    title='Who Gets Praised? Top Hero Entities by Meme Category',
    height=420,
    showlegend=False,
    font=dict(size=11)
)
fig2.show()

# victim comparison
fig3 = make_subplots(rows=1, cols=2,
                     subplot_titles=('US Politics Memes: Top Victims',
                                     'COVID Memes: Top Victims'))

for i, cat in enumerate(['US Politics', 'COVID'], 1):
    counts = get_entity_counts(train_df, 'victim', cat)
    top = counts.most_common(10)
    if not top:
        continue
    entities = [x[0] for x in top][::-1]
    values   = [x[1] for x in top][::-1]

    color = '#8e44ad' if cat == 'US Politics' else '#d35400'

    fig3.add_trace(
        go.Bar(y=entities, x=values, orientation='h',
               marker_color=color, name=cat),
        row=1, col=i
    )

fig3.update_layout(
    title='Who Gets Hurt? Top Victim Entities by Meme Category',
    height=420,
    showlegend=False,
    font=dict(size=11)
)
fig3.show()

# run VADER sentiment on cleaned OCR text 
sid = SentimentIntensityAnalyzer()

def get_compound(text):
    """ 
    Return the VADER compound sentiment score for a text string.
    """
    if not isinstance(text, str) or len(text.strip()) ==0:
        return 0.0 
    return sid.polarity_scores(text)['compound']

train_df['sentiment'] = train_df['ocr_clean'].apply(get_compound)

print("Sentiment by category:")
print(train_df.groupby('category')['sentiment'].describe().round(3))

Sentiment by category:
              count   mean    std    min    25%  50%    75%    max
category                                                          
COVID        2700.0  0.000  0.390 -0.962 -0.178  0.0  0.231  0.975
US Politics  2852.0  0.103  0.477 -0.980 -0.225  0.0  0.494  0.986

# visualize sentiment distributions 
fig3 = go.Figure()

for cat, color in [('US Politics', '#e74c3c'), ('COVID', '#3498db')]:
    subset = train_df[train_df['category'] == cat]['sentiment']
    fig3.add_trace(go.Histogram(
        x=subset, 
        name=cat,
        opacity=0.65, 
        marker_color=color,
        nbinsx=40
    ))

fig3.update_layout(
    barmode='overlay',
    title='Sentiment Score Distribution by Meme Category', 
    xaxis_title='VADER Compound Score (negative ← 0 → positive)',
    yaxis_title='Count',
    height=380,
    legend=dict(x=0.01, y=0.99),
    font=dict(size=11)
)

fig3.show()

# box plot for cleaner comparison 
fig4 = px.box(
    train_df, x='category', y='sentiment',
    color='category',
    color_discrete_map={'US Politics': '#e74c3c', 'COVID': '#3498db'},
    title='Sentiment Score by Category (Box Plot)',
    labels={'sentiment': 'VADER Compound Score', 'category': 'Meme Category'},
    points='outliers'
)
fig4.update_layout(height=380, showlegend=False, font=dict(size=11))
fig4.show()

# TF-IDF to find category-distinctive vocabulary
    # treat all memes in each category as one big document 
        # then use TF-IDF to find words that are especially characteristic of politics memes vs. COVID memes relative to each other


# combine all OCR text per category into one string each 
politics_text = ' '.join(train_df[train_df['category']=='US Politics']['ocr_clean'])
covid_text = ' '.join(train_df[train_df['category']=='COVID']['ocr_clean'])

# fit TF-IDF on both "documents"
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2)    # include bigrams for more meaningful phrases
)

tfidf_matrix = tfidf.fit_transform([politics_text, covid_text])
feature_names = tfidf.get_feature_names_out()

# extract top terms per category
def top_tfidf_terms(matrix, row_idx, feature_names, n=20):
    """ 
    Get the top n TF-IDF terms for a given row (category).
    """
    row = matrix[row_idx].toarray().flatten()
    top_idx = row.argsort()[::-1][:n]
    return [(feature_names[i], round(row[i], 4)) for i in top_idx]

politics_terms = top_tfidf_terms(tfidf_matrix, 0, feature_names)
covid_terms = top_tfidf_terms(tfidf_matrix, 1, feature_names)

print("Top TF-IDF terms — US Politics:")
for term, score in politics_terms[:15]:
    print(f"\t{term}: {score}")

print("\nTop TF-IDF terms — COVID:")
for term, score in covid_terms[:15]:
    print(f"\t{term}: {score}")

Top TF-IDF terms — US Politics:
	party: 0.5251
	biden: 0.2761
	trump: 0.275
	joe: 0.2457
	obama: 0.2379
	libertarian: 0.1874
	republican: 0.1643
	democratic party: 0.1617
	democratic: 0.1333
	com: 0.1306
	republican party: 0.1212
	libertarian party: 0.1205
	president: 0.1195
	people: 0.1107
	debate: 0.1065

Top TF-IDF terms — COVID:
	coronavirus: 0.3228
	virus: 0.3205
	home: 0.2894
	covid: 0.2584
	corona: 0.2157
	work: 0.2118
	mask: 0.2087
	trump: 0.1839
	work home: 0.1832
	china: 0.1738
	people: 0.1474
	com: 0.1451
	like: 0.1389
	corona virus: 0.1241
	day: 0.1102

# visualize TF-IDF as horizontal bar charts 
fig5 = make_subplots(rows=1, cols=2,
                     subplot_titles=('US Politics: Top Distinctive Terms',
                                     'COVID: Top Distinctive Terms'))

for i, (terms, color) in enumerate(
    [(politics_terms[:15], '#e74c3c'), (covid_terms[:15], '#3498db')], 1):

    words = [t[0] for t in terms][::-1]
    scores = [t[1] for t in terms][::-1]

    fig5.add_trace(
        go.Bar(y=words, x=scores, orientation='h',
               marker_color=color),
        row=1, col=i
    )

fig5.update_layout(
    title='TF-IDF: Most Distinctive Vocabulary by Category',
    height=480,
    showlegend=False,
    font=dict(size=10)
)

fig5.show()

# Load and organize images by category
    # images are already extracted into data/SampleImagesData/
        # two subfolders: covidMemes/ and usPoliticsMemes/


image_data = {}  # { filename: {'img': PIL Image, 'category': str} }

image_dirs = {
    'COVID': '../data/SampleImagesData/SampleImagesData/covidMemes',
    'US Politics': '../data/SampleImagesData/SampleImagesData/usPoliticsMemes'
}

for cat, folder in image_dirs.items():
    for fname in os.listdir(folder):
        if fname.endswith('.png'):
            img_path = os.path.join(folder, fname)
            img = Image.open(img_path).convert('RGB')
            image_data[fname] = {'img': img, 'category': cat}

print(f"Loaded {len(image_data)} images")
for cat in ['COVID', 'US Politics']:
    n = sum(1 for v in image_data.values() if v['category'] == cat)
    print(f"  {cat}: {n} images")

Loaded 16 images
  COVID: 8 images
  US Politics: 8 images

# analyze color features for each image
    # for each image compute: 
        # average brightness (V channel in HSV — higher = brighter)
        # average saturation (S channel in HSV — higher = more colorful)
        # dominant hue bin (H channel — e.g., warm reds vs cool blues)
# these are simple but interpretable features 


color_records = []

for fname, info in image_data.items():
    img = info['img']
    cat = info['category']

    # convert PIL image to numpy array, then to OpenCV BGR format
    img_np = np.array(img)
    img_bgr = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR)
    img_hsv = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2HSV)

    # split into H, S, V channels
    h_channel, s_channel, v_channel = cv2.split(img_hsv)

    # compute means (these are our image-level features)
    avg_brightness = v_channel.mean() # 0-255
    avg_saturation = s_channel.mean() # 0-255
    avg_hue = h_channel.mean() # 0-179 in OpenCV HSV

    color_records.append({
        'filename': fname,
        'category': cat,
        'brightness': avg_brightness,
        'saturation': avg_saturation,
        'hue': avg_hue
    })

color_df = pd.DataFrame(color_records)
print(color_df.groupby('category')[['brightness','saturation','hue']].mean().round(2))

             brightness  saturation    hue
category                                  
COVID            123.65       76.63  48.14
US Politics      132.27       55.18  49.58

# visualize color features
fig6 = make_subplots(rows=1, cols=3,
                     subplot_titles=('Brightness', 'Saturation', 'Hue'))

metrics = ['brightness', 'saturation', 'hue']
colors_map = {'US Politics': '#e74c3c', 'COVID': '#3498db'}

for col_idx, metric in enumerate(metrics, 1):
    for cat in ['US Politics', 'COVID']:
        subset = color_df[color_df['category']==cat][metric]
        fig6.add_trace(
            go.Box(
                y=subset,
                name=cat,
                marker_color=colors_map[cat],
                showlegend=(col_idx==1)    # only show legend once
            ),
            row=1, col=col_idx
        )

fig6.update_layout(
    title='Visual Color Features by Meme Category (16 sample images)',
    height=400,
    boxmode='group',
    font=dict(size=11)
)

fig6.show()

# estimate what proportion of each meme is text vs. image
    # strategy: convert to grayscale, apply adaptive thresholding to isolate
        # high-contrast text regions, then calculate what % of pixels are 'text'
    # NOTE: this is an approximation (meme text is usually bold, high-contrast, and distinct from photographic content)

text_coverage_records = []

for fname, info in image_data.items():
    img = info['img']
    cat = info['category']

    img_np = np.array(img)
    img_gray = cv2.cvtColor(img_np, cv2.COLOR_RGB2GRAY)

    # adaptive threshold isolates high-contrast regions (like bold meme text)
    thresh = cv2.adaptiveThreshold(
        img_gray, 255,
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY_INV,
        blockSize=15,
        C=10
    )

    # text coverage = proportion of pixels flagged as high-contrast
    total_pixels = thresh.size
    text_pixels = thresh.sum() // 255 
    coverage_pct = (text_pixels / total_pixels) * 100

    text_coverage_records.append({
        'filename': fname,
        'category': cat,
        'text_coverage_pct': round(coverage_pct, 2)
    })

coverage_df = pd.DataFrame(text_coverage_records)

# visualize 
    # show actual meme images with text coverage % annotated
        # much more informative than a dot plot with only 16 images
            # sorted by category then by coverage % so you can see the range within each group

sorted_coverage = coverage_df.sort_values(['category', 'text_coverage_pct'], ascending=[True, False])

fig, axes = plt.subplots(2, 8, figsize=(20, 6))
fig.suptitle('Estimated Text Coverage Across All 16 Sample Memes', fontsize=13, fontweight='bold', y=1.02)

for ax, (_, row) in zip(axes.flatten(), sorted_coverage.iterrows()):
    fname = row['filename']
    cat = row['category']
    pct = row['text_coverage_pct']
    
    # get the actual image
    img = image_data[fname]['img']
    
    ax.imshow(img)
    ax.axis('off')
    
    # color title by category
    color = '#3498db' if cat == 'COVID' else '#e74c3c'
    
    # annotate with filename and text coverage percentage
    ax.set_title(f"{fname.replace('.png','')}\nText: {pct}%", fontsize=6.5, color=color, fontweight='bold')

plt.tight_layout()
plt.show()

print("Mean text coverage by category:")
print(coverage_df.groupby('category')['text_coverage_pct'].mean().round(2))

Mean text coverage by category:
category
COVID          16.40
US Politics    12.89
Name: text_coverage_pct, dtype: float64

# build the summary table 
summary_records = []
for cat in ['US Politics', 'COVID']:
    subset_text = train_df[train_df['category']==cat]
    subset_color = color_df[color_df['category']==cat]
    subset_cov = coverage_df[coverage_df['category']==cat]

    top_villain = get_entity_counts(train_df, 'villain', cat).most_common(1)
    top_villain_str = top_villain[0][0] if top_villain else 'N/A'

    top_hero = get_entity_counts(train_df, 'hero', cat).most_common(1)
    top_hero_str = top_hero[0][0] if top_hero else 'N/A'

    top_victim = get_entity_counts(train_df, 'victim', cat).most_common(1)
    top_victim_str = top_victim[0][0] if top_victim else 'N/A'

    summary_records.append({
        'Category': cat,
        'N (train)': len(subset_text),
        'Avg Sentiment': round(subset_text['sentiment'].mean(), 3),
        'Top Villain': top_villain_str,
        'Top Hero': top_hero_str,
        'Top Victim': top_victim_str,
        'Avg Brightness': round(subset_color['brightness'].mean(), 1),
        'Avg Saturation': round(subset_color['saturation'].mean(), 1),
        'Avg Text Coverage %':round(subset_cov['text_coverage_pct'].mean(), 1)
    })

summary_table = pd.DataFrame(summary_records).set_index('Category')
print("=" * 48)
print("FULL FEATURE SUMMARY BY CATEGORY")
print("=" * 48)
print(summary_table.T.to_string())

================================================
FULL FEATURE SUMMARY BY CATEGORY
================================================
Category              US Politics         COVID
N (train)                    2852          2700
Avg Sentiment               0.103           0.0
Top Villain          donald trump  donald trump
Top Hero             donald trump  chuck norris
Top Victim           donald trump  donald trump
Avg Brightness              132.3         123.6
Avg Saturation               55.2          76.6
Avg Text Coverage %          12.9          16.4

from plotly.subplots import make_subplots

fig9 = make_subplots(
    rows=1, cols=2,
    subplot_titles=('US Politics Memes', 'COVID Memes'),
    shared_yaxes=True
)

# create villain_count column
train_df['villain_count'] = train_df['villain_parsed'].apply(len)

for i, (cat, color) in enumerate([('US Politics', '#e74c3c'), ('COVID', '#3498db')], 1):
    subset = train_df[train_df['category'] == cat]
    
    # cap villain count at 3 (covers 99.4% of data)
    subset = subset[subset['villain_count'] <= 3].copy()
    
    fig9.add_trace(
        go.Histogram2dContour(
            x=subset['sentiment'],
            y=subset['villain_count'],
            colorscale=[[0, 'rgba(255,255,255,0)'], [1, color]],
            showscale=False,
            ncontours=15,
            contours=dict(showlines=True, coloring='fill'),
            line=dict(width=0.5, color='rgba(0,0,0,0.25)'),
            name=cat
        ),
        row=1, col=i
    )
    
    # faint individual dots on top
    fig9.add_trace(
        go.Scatter(
            x=subset['sentiment'],
            y=subset['villain_count'],
            mode='markers',
            marker=dict(size=3, color=color, opacity=0.25),
            showlegend=False,
            hovertemplate='Sentiment: %{x:.2f}<br>Villains: %{y}<extra></extra>'
        ),
        row=1, col=i
    )

fig9.update_layout(
    title=dict(
        text='Where Do Memes Cluster? Sentiment vs. Villain Tags<br>'
             '<sup>Darker = more memes concentrated in that area | '
             'faint dots = individual memes | villain count capped at 5</sup>',
        x=0.5
    ),
    height=480,
    font=dict(size=11),
    showlegend=False
)

fig9.update_xaxes(
    title_text='Sentiment Score (negative <- 0 -> positive)',
    zeroline=True, zerolinecolor='grey', zerolinewidth=1.5,
    range=[-1.05, 1.05],
    showgrid=True, gridcolor='rgba(0,0,0,0.08)'
)
fig9.update_yaxes(
    title_text='Number of Villain Tags',
    dtick=1,
    range=[-0.3, 3.3],
    showgrid=True, gridcolor='rgba(0,0,0,0.08)'
)

fig9.show()

# apply the same preprocessing to val_df
val_df['ocr_clean'] = val_df['OCR'].apply(clean_ocr)
val_df['word_count'] = val_df['ocr_clean'].apply(lambda x: len(x.split()))
val_df['category'] = val_df['image'].apply(
    lambda x: 'COVID' if str(x).startswith('covid') else 'US Politics'
)
for col in ['hero', 'villain', 'victim', 'other']:
    val_df[col + '_parsed'] = val_df[col].apply(parse_tag)

val_df['sentiment'] = val_df['ocr_clean'].apply(get_compound)

print("Validation set category breakdown:")
print(val_df['category'].value_counts())
print()

# compare top villains, heroes, and victims between train and validation sets
for role in ['villain', 'hero', 'victim']:
    print("=" * 55)
    print(f"TOP 5 {role.upper()}S -- TRAIN vs. VALIDATION")
    print("=" * 55)
    for cat in ['US Politics', 'COVID']:
        train_top = get_entity_counts(train_df, role, cat).most_common(5)
        val_top = get_entity_counts(val_df,   role, cat).most_common(5)
        print(f"\n\t{cat}:")
        print(f"\tTrain: {[x[0] for x in train_top]}")
        print(f"\tValidation: {[x[0] for x in val_top]}")
    print()

# compare sentiment
print("=" * 55)
print("AVG SENTIMENT -- TRAIN vs. VALIDATION")
print("=" * 55)
for cat in ['US Politics', 'COVID']:
    train_sent = train_df[train_df['category']==cat]['sentiment'].mean()
    val_sent = val_df[val_df['category']==cat]['sentiment'].mean()
    print(f"\t{cat}: train={round(train_sent,3)}  validation={round(val_sent,3)}")

Validation set category breakdown:
category
US Politics    350
COVID          300
Name: count, dtype: int64

=======================================================
TOP 5 VILLAINS -- TRAIN vs. VALIDATION
=======================================================

	US Politics:
	Train: ['donald trump', 'joe biden', 'democratic party', 'republican party', 'barack obama']
	Validation: ['donald trump', 'joe biden', 'democratic party', 'republican party', 'republicans']

	COVID:
	Train: ['donald trump', 'china', 'coronavirus', '2020', 'covid19']
	Validation: ['donald trump', 'coronavirus', 'china', 'wuhan', 'joe biden']

=======================================================
TOP 5 HEROS -- TRAIN vs. VALIDATION
=======================================================

	US Politics:
	Train: ['donald trump', 'barack obama', 'green party', 'libertarian party', 'joe biden']
	Validation: ['donald trump', 'barack obama', 'republican party', 'joe biden', 'libertarian']

	COVID:
	Train: ['chuck norris', 'corona beer', 'weed', 'dr. anthony fauci', 'joe biden']
	Validation: ['joe biden', 'goicho saib', 'kamala harris', 'zoom meeting', 'darth vader']

=======================================================
TOP 5 VICTIMS -- TRAIN vs. VALIDATION
=======================================================

	US Politics:
	Train: ['donald trump', 'america', 'barack obama', 'democratic party', 'women']
	Validation: ['republican party', 'america', 'donald trump', 'barack obama', 'people']

	COVID:
	Train: ['donald trump', 'people', 'china', 'coronavirus', 'parents']
	Validation: ['donald trump', 'doctors', 'america', 'world', 'italy']

=======================================================
AVG SENTIMENT -- TRAIN vs. VALIDATION
=======================================================
	US Politics: train=0.103  validation=0.127
	COVID: train=0.0  validation=0.009

# visual sanity check
    # grouped bar chart comparing train vs. validation for top 5 entities per role (villain, hero, victim) + sentiment
        # if train and validation bars are similar heights, findings replicate
from plotly.subplots import make_subplots

fig_val = make_subplots(
    rows=2, cols=4,
    subplot_titles=(
        'US Politics: Villains', 'COVID: Villains',
        'US Politics: Heroes',   'COVID: Heroes',
        'US Politics: Victims',  'COVID: Victims',
        'Sentiment Comparison',    ''
    ),
    vertical_spacing=0.18,
    horizontal_spacing=0.08
)

roles = ['villain', 'hero', 'victim']
positions = [(1,1), (1,2), (1,3), (1,4), (2,1), (2,2)]

for idx, (role, cat) in enumerate([(r, c) for r in roles for c in ['US Politics', 'COVID']]):
    row, col = positions[idx]

    # get top 5 entities from training set
    train_top = get_entity_counts(train_df, role, cat).most_common(5)
    entities = [x[0] for x in train_top]

    # get counts for those same entities in validation set
    val_counts = get_entity_counts(val_df, role, cat)
    train_counts = [x[1] for x in train_top]
    val_counts_ = [val_counts.get(e, 0) for e in entities]

    # normalize by dataset size so comparison is fair
    # train has 5552 rows, val has 650 (need to normalize to per-1000 memes)
    train_norm = [c / len(train_df) * 1000 for c in train_counts]
    val_norm = [c / len(val_df)   * 1000 for c in val_counts_]

    # truncate long entity names for readability
    short_entities = [e[:12] + '..' if len(e) > 12 else e for e in entities]

    # train bars
    fig_val.add_trace(go.Bar(
        x=short_entities, y=train_norm,
        name='Train',
        marker_color='#2ecc71',
        opacity=0.85,
        showlegend=(idx == 0),
        legendgroup='train'
    ), row=row, col=col)

    # validation bars
    fig_val.add_trace(go.Bar(
        x=short_entities, y=val_norm,
        name='Validation',
        marker_color='#f39c12',
        opacity=0.85,
        showlegend=(idx == 0),
        legendgroup='val'
    ), row=row, col=col)

# sentiment comparison panel (row 2, col 3)
categories = ['US Politics', 'COVID']
train_sents = [train_df[train_df['category']==c]['sentiment'].mean() for c in categories]
val_sents = [val_df[val_df['category']==c]['sentiment'].mean()   for c in categories]

fig_val.add_trace(go.Bar(
    x=categories, y=train_sents,
    name='Train', marker_color='#2ecc71',
    opacity=0.85, showlegend=False,
    legendgroup='train'
), row=2, col=3)

fig_val.add_trace(go.Bar(
    x=categories, y=val_sents,
    name='Validation', marker_color='#f39c12',
    opacity=0.85, showlegend=False,
    legendgroup='val'
), row=2, col=3)

fig_val.update_layout(
    barmode='group',
    title=dict(
        text='Train vs. Validation: Do the Findings Replicate?<br>'
             '<sup>Counts normalized per 1,000 memes so train and validation are directly comparable</sup>',
        x=0.5
    ),
    height=620,
    font=dict(size=10),
    legend=dict(x=0.88, y=0.15),
    showlegend=True
)

fig_val.update_yaxes(title_text='Count per 1,000 memes')
fig_val.update_yaxes(title_text='Avg Sentiment Score', row=2, col=3)

fig_val.show()

# load all speech TSVs from the data_clean folder 
    # the dataset is organized as data_clean/{source}/{Speaker}/cleantext_{Speaker}.tsv
        # loop through all TSVs, load each one, and tag with speaker and source 

import os 
import glob 

speech_dfs = []

data_clean_path = '../data/data_clean'

for tsv_path in glob.glob(f'{data_clean_path}/**/*.tsv', recursive=True):
    parts = tsv_path.replace('\\', '/').split('/')
    source = parts[-3]    # e.g., cspan, votesmart, medium, millercenter
    speaker = parts[-2]    # e.g., DonaldTrump, JoeBiden 

    df = pd.read_csv(tsv_path, sep='\t', on_bad_lines='skip')
    df['speaker'] = speaker
    df['source_site'] = source
    speech_dfs.append(df)
    speech_df = pd.concat(speech_dfs, ignore_index=True)

    # add party label 
    party_map = {
        'DonaldTrump': 'Republican',
        'MikePence': 'Republican',
        'JoeBiden': 'Democrat',
        'KamalaHarris': 'Democrat'
    }
    speech_df['party'] = speech_df['speaker'].map(party_map)

    # normalize speech type column 
    speech_df['Type'] = speech_df['Type'].str.strip().str.lower()

    # parse date
    speech_df['Date'] = pd.to_datetime(speech_df['Date'], errors='coerce')

    # drop rows with no clean text 
    speech_df = speech_df.dropna(subset=['CleanText']).reset_index(drop=True)

    # quick look at data 
    print(f"Total speeches loaded: {len(speech_df)}")
    print()
    print("Speeches by speaker:")
    print(speech_df['speaker'].value_counts())
    print()
    print("Speeches by party:")
    print(speech_df['party'].value_counts())
    print()
    print("Date range:", speech_df['Date'].min().date(),
          "to", speech_df['Date'].max().date())

Total speeches loaded: 44

Speeches by speaker:
speaker
MikePence    44
Name: count, dtype: int64

Speeches by party:
party
Republican    44
Name: count, dtype: int64

Date range: 2019-01-17 to 2021-01-17
Total speeches loaded: 80

Speeches by speaker:
speaker
MikePence       44
KamalaHarris    36
Name: count, dtype: int64

Speeches by party:
party
Republican    44
Democrat      36
Name: count, dtype: int64

Date range: 2019-01-17 to 2021-01-17
Total speeches loaded: 186

Speeches by speaker:
speaker
JoeBiden        106
MikePence        44
KamalaHarris     36
Name: count, dtype: int64

Speeches by party:
party
Democrat      142
Republican     44
Name: count, dtype: int64

Date range: 2019-01-17 to 2021-01-17
Total speeches loaded: 283

Speeches by speaker:
speaker
JoeBiden        106
DonaldTrump      97
MikePence        44
KamalaHarris     36
Name: count, dtype: int64

Speeches by party:
party
Democrat      142
Republican    141
Name: count, dtype: int64

Date range: 2019-01-17 to 2021-01-17
Total speeches loaded: 375

Speeches by speaker:
speaker
MikePence       136
JoeBiden        106
DonaldTrump      97
KamalaHarris     36
Name: count, dtype: int64

Speeches by party:
party
Republican    233
Democrat      142
Name: count, dtype: int64

Date range: 2019-01-11 to 2021-01-17
Total speeches loaded: 464

Speeches by speaker:
speaker
MikePence       136
KamalaHarris    125
JoeBiden        106
DonaldTrump      97
Name: count, dtype: int64

Speeches by party:
party
Republican    233
Democrat      231
Name: count, dtype: int64

Date range: 2019-01-11 to 2021-01-17
Total speeches loaded: 641

Speeches by speaker:
speaker
JoeBiden        283
MikePence       136
KamalaHarris    125
DonaldTrump      97
Name: count, dtype: int64

Speeches by party:
party
Democrat      408
Republican    233
Name: count, dtype: int64

Date range: 2019-01-11 to 2021-01-29
Total speeches loaded: 876

Speeches by speaker:
speaker
DonaldTrump     332
JoeBiden        283
MikePence       136
KamalaHarris    125
Name: count, dtype: int64

Speeches by party:
party
Republican    468
Democrat      408
Name: count, dtype: int64

Date range: 2019-01-08 to 2021-01-29
Total speeches loaded: 915

Speeches by speaker:
speaker
DonaldTrump     332
JoeBiden        283
KamalaHarris    164
MikePence       136
Name: count, dtype: int64

Speeches by party:
party
Republican    468
Democrat      447
Name: count, dtype: int64

Date range: 2019-01-08 to 2021-01-29
Total speeches loaded: 1051

Speeches by speaker:
speaker
JoeBiden        419
DonaldTrump     332
KamalaHarris    164
MikePence       136
Name: count, dtype: int64

Speeches by party:
party
Democrat      583
Republican    468
Name: count, dtype: int64

Date range: 2019-01-08 to 2021-01-29
Total speeches loaded: 1052

Speeches by speaker:
speaker
JoeBiden        420
DonaldTrump     332
KamalaHarris    164
MikePence       136
Name: count, dtype: int64

Speeches by party:
party
Democrat      584
Republican    468
Name: count, dtype: int64

Date range: 2019-01-08 to 2021-01-29
Total speeches loaded: 1081

Speeches by speaker:
speaker
JoeBiden        420
DonaldTrump     361
KamalaHarris    164
MikePence       136
Name: count, dtype: int64

Speeches by party:
party
Democrat      584
Republican    497
Name: count, dtype: int64

Date range: 2019-01-08 to 2021-01-29

# apply VADER sentiment to each speech  
    # set up using the same get_compound function used in Step 3 (already defined earlier in notebook)

# CleanText is the pre-cleaned speech transcript from the dataset authors
    # lowercase CleanText before sentiment (VADER is case-sensitive)
        # the Republican speeches are stored in ALL CAPS which would artificially inflates scores later
speech_df['CleanText_lower'] = speech_df['CleanText'].str.lower()

# fix: score sentiment per 50-word chunk rather than full speech
    # VADER was designed for short texts 
        # scoring 3000-word documents compounds toward extreme values and isn't meaningful
        # chunking gives a fairer per-unit comparison across speakers
import re

def sentiment_by_chunks(text, chunk_size=50):
    """Split text into chunks of ~50 words and average VADER scores."""
    if not isinstance(text, str) or len(text.strip()) == 0:
        return 0.0
    words = text.split()
    chunks = [' '.join(words[i:i+chunk_size])
              for i in range(0, len(words), chunk_size)]
    scores = [get_compound(chunk) for chunk in chunks if chunk.strip()]
    return round(sum(scores) / len(scores), 4) if scores else 0.0

speech_df['sentiment'] = speech_df['CleanText_lower'].apply(sentiment_by_chunks)

print("Sentiment summary by speaker (chunked):")
print(speech_df.groupby('speaker')['sentiment'].describe().round(3))
print()
print("Sentiment summary by party (chunked):")
print(speech_df.groupby('party')['sentiment'].describe().round(3))

Sentiment summary by speaker (chunked):
              count   mean    std    min    25%    50%    75%    max
speaker                                                             
DonaldTrump   361.0  0.344  0.309 -0.901  0.192  0.306  0.551  0.942
JoeBiden      420.0  0.132  0.317 -0.737 -0.063  0.181  0.329  0.887
KamalaHarris  164.0  0.217  0.421 -0.848 -0.023  0.271  0.498  0.959
MikePence     136.0  0.507  0.193 -0.612  0.447  0.560  0.634  0.779

Sentiment summary by party (chunked):
            count   mean    std    min    25%    50%    75%    max
party                                                             
Democrat    584.0  0.156  0.351 -0.848 -0.053  0.208  0.371  0.959
Republican  497.0  0.389  0.291 -0.901  0.227  0.375  0.595  0.942

# sentiment distribution by speaker (box plot)
fig_s1 = px.box(
    speech_df,
    x='speaker', y='sentiment',
    color='party',
    color_discrete_map={'Republican': '#e74c3c', 'Democrat': '#3498db'},
    title='Sentiment Score by Speaker: 2020 Campaign Speeches (Scored per 50-word chunk and averaged - avoids VADER inflation on long documents)',
    labels={'sentiment': 'VADER Compound Score', 'speaker': 'Speaker'},
    points='outliers',
    category_orders={'speaker': ['DonaldTrump', 'MikePence', 'JoeBiden', 'KamalaHarris']}
)
fig_s1.update_layout(height=420, font=dict(size=11))
fig_s1.show()

# cross-modal comparison: meme sentiment vs speech sentiment
    # this is the key comparison chart of Step 7
        # plot average sentiment for each speaker/category side by side so we can directly see whether memes match the tone of real speeches

# meme sentiment averages (from Step 3)
meme_politics_sent = train_df[train_df['category']=='US Politics']['sentiment'].mean()
meme_covid_sent = train_df[train_df['category']=='COVID']['sentiment'].mean()

# speech sentiment averages by speaker (chunked scores from 7.1)
speech_sent = speech_df.groupby('speaker')['sentiment'].mean()

# build comparison dataframe
comparison_data = []

# meme rows
comparison_data.append({'label': 'US Politics Memes', 'sentiment': meme_politics_sent, 'type': 'Meme', 'color': '#e74c3c'})
comparison_data.append({'label': 'COVID Memes', 'sentiment': meme_covid_sent, 'type': 'Meme', 'color': '#3498db'})

# speech rows
speaker_colors = {
    'DonaldTrump': '#c0392b',
    'MikePence': '#e67e22',
    'JoeBiden': '#2980b9',
    'KamalaHarris': '#8e44ad'
}
speaker_labels = {
    'DonaldTrump': 'Trump Speeches',
    'MikePence': 'Pence Speeches',
    'JoeBiden': 'Biden Speeches',
    'KamalaHarris': 'Harris Speeches'
}
for spk, sent in speech_sent.items():
    comparison_data.append({
        'label': speaker_labels[spk],
        'sentiment': sent,
        'type': 'Speech',
        'color': speaker_colors[spk]
    })

comp_df = pd.DataFrame(comparison_data)

# horizontal bar chart (easy to compare at a glance)
fig_s2 = go.Figure()

for _, row in comp_df.iterrows():
    fig_s2.add_trace(go.Bar(
        y=[row['label']],
        x=[row['sentiment']],
        orientation='h',
        marker_color=row['color'],
        name=row['label'],
        showlegend=False,
        hovertemplate=f"{row['label']}: %{{x:.3f}}<extra></extra>"
    ))

# add a vertical line at 0
fig_s2.add_vline(x=0, line_width=1.5, line_dash='dash', line_color='grey')

# shade meme rows differently
fig_s2.add_hrect(y0=-0.5, y1=1.5, fillcolor='rgba(200,200,200,0.15)',
                  line_width=0, annotation_text='Memes',
                  annotation_position='right')
fig_s2.add_hrect(y0=1.5, y1=5.5, fillcolor='rgba(255,255,255,0)',
                  line_width=0, annotation_text='Speeches',
                  annotation_position='right')

fig_s2.update_layout(
    title=dict(
        text='Meme Sentiment vs. Speech Sentiment (Direct Comparison)<br>'
             '<sup>VADER compound score: negative <- 0 -> positive | dashed line = neutral | '
             'speech scores averaged per 50-word chunk</sup>',
        x=0.5
    ),
    xaxis=dict(title='Avg VADER Compound Score', range=[-0.3, 0.7]),
    yaxis=dict(title=''),
    height=420,
    font=dict(size=11),
    margin=dict(r=100)
)

fig_s2.show()

# TF-IDF on speeches by speaker
    # same approach as Step 3.3: treat each speaker as one big document then find the most distinctive vocabulary per speaker
from sklearn.feature_extraction.text import TfidfVectorizer

# combine all speech text per speaker into one string
speaker_docs = {}
for spk in ['DonaldTrump', 'JoeBiden', 'MikePence', 'KamalaHarris']:
    texts = speech_df[speech_df['speaker']==spk]['CleanText'].dropna()
    speaker_docs[spk] = ' '.join(texts.str.lower())
    speakers = list(speaker_docs.keys())
    documents = list(speaker_docs.values())

# fit TF-IDF -- include bigrams for more meaningful phrases
tfidf_speech = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2)
)

tfidf_matrix_speech = tfidf_speech.fit_transform(documents)
feature_names_speech = tfidf_speech.get_feature_names_out()

def top_tfidf_terms(matrix, row_idx, feature_names, n=15):
    row = matrix[row_idx].toarray().flatten()
    top_idx = row.argsort()[::-1][:n]
    return [(feature_names[i], round(row[i], 4)) for i in top_idx]

print("Top TF-IDF terms by speaker:")
for i, spk in enumerate(speakers):
    terms = top_tfidf_terms(tfidf_matrix_speech, i, feature_names_speech, n=12)
    print(f"\n {spk}:")
    for term, score in terms:
        print(f" {term}: {score}")

Top TF-IDF terms by speaker:

 DonaldTrump:
 people: 0.2769
 great: 0.2386
 know: 0.2371
 said: 0.2351
 going: 0.2222
 want: 0.1917
 don: 0.1866
 like: 0.1748
 right: 0.1516
 thank: 0.1494
 country: 0.1468
 years: 0.1444

 JoeBiden:
 president: 0.2772
 people: 0.2712
 trump: 0.208
 america: 0.1763
 american: 0.1713
 going: 0.1663
 know: 0.1581
 country: 0.156
 just: 0.1325
 need: 0.1318
 make: 0.1283
 nation: 0.1215

 MikePence:
 president: 0.3737
 american: 0.2727
 applause: 0.2214
 america: 0.2141
 people: 0.1968
 know: 0.1912
 just: 0.171
 great: 0.16
 today: 0.1544
 trump: 0.1407
 years: 0.1394
 ve: 0.1387

 KamalaHarris:
 people: 0.3991
 know: 0.2257
 president: 0.1806
 let: 0.1742
 country: 0.1735
 america: 0.1735
 joe: 0.1573
 states: 0.1562
 justice: 0.1333
 health: 0.1307
 united: 0.1299
 fight: 0.1269

# visualize TF-IDF per speaker as side-by-side bar charts
fig_s3 = make_subplots(rows=1, cols=4,
                       subplot_titles=('Trump', 'Biden', 'Pence', 'Harris'))
speaker_colors_tfidf = {
    'DonaldTrump': '#c0392b',
    'JoeBiden': '#2980b9',
    'MikePence': '#e67e22',
    'KamalaHarris': '#8e44ad'
}

for col_idx, spk in enumerate(speakers, 1):
    i = speakers.index(spk)
    terms = top_tfidf_terms(tfidf_matrix_speech, i, feature_names_speech, n=12)
    words = [t[0] for t in terms][::-1]
    scores = [t[1] for t in terms][::-1]
    fig_s3.add_trace(
        go.Bar(y=words, x=scores, orientation='h',
               marker_color=speaker_colors_tfidf[spk]), row=1, col=col_idx
               )
    
fig_s3.update_layout(
    title='TF-IDF: Most Distinctive Vocabulary per Speaker (Campaign Speeches)',
    height=500,
    showlegend=False,font=dict(size=9)
)
fig_s3.show()

# overlap analysis: which meme TF-IDF terms also appear in speech TF-IDF?
    # this directly answers: does meme vocabulary mirror speech vocabulary?
        # get top 30 terms from each meme category (from Step 3.3)
            # assumes tfidf, tfidf_matrix, feature_names are still in memory from Step 3.3
politics_meme_terms = set(t[0] for t in top_tfidf_terms(tfidf_matrix, 0, feature_names, n=30))
covid_meme_terms = set(t[0] for t in top_tfidf_terms(tfidf_matrix, 1, feature_names, n=30))

# get top 30 terms from each speaker's speeches
print("Vocabulary overlap -- Meme terms that also appear in Speech TF-IDF top 30:")
print()

for i, spk in enumerate(speakers):
    speech_terms = set(t[0] for t in top_tfidf_terms(tfidf_matrix_speech, i, feature_names_speech, n=30))
    overlap_politics = politics_meme_terms & speech_terms
    overlap_covid = covid_meme_terms & speech_terms
    
    print(f" {spk}:")
    print(f" Overlap with US Politics meme terms ({len(overlap_politics)}): "
          f"{sorted(overlap_politics)}")
    print(f" Overlap with COVID meme terms ({len(overlap_covid)}): "
          f"{sorted(overlap_covid)}")
    print()

Vocabulary overlap -- Meme terms that also appear in Speech TF-IDF top 30:

 DonaldTrump:
 Overlap with US Politics meme terms (7): ['america', 'biden', 'don', 'just', 'like', 'people', 'president']
 Overlap with COVID meme terms (7): ['don', 'just', 'like', 'new', 'people', 'time', 'world']

 JoeBiden:
 Overlap with US Politics meme terms (8): ['america', 'donald', 'donald trump', 'just', 'like', 'people', 'president', 'trump']
 Overlap with COVID meme terms (7): ['just', 'like', 'people', 'time', 'trump', 'work', 'world']

 MikePence:
 Overlap with US Politics meme terms (6): ['america', 'just', 'like', 'people', 'president', 'trump']
 Overlap with COVID meme terms (8): ['day', 'just', 'like', 'people', 'time', 'trump', 'work', 'world']

 KamalaHarris:
 Overlap with US Politics meme terms (9): ['america', 'biden', 'joe', 'joe biden', 'just', 'people', 'president', 'trump', 'vote']
 Overlap with COVID meme terms (6): ['just', 'people', 'time', 'trump', 'work', 'working']

# COVID keyword frequency in speeches
    # filter to speeches from 2020 onward (the pandemic period)
covid_period = speech_df[speech_df['Date'] >= '2020-01-01'].copy()
print(f"Speeches from 2020 onward: {len(covid_period)}")
print(covid_period['speaker'].value_counts())
print()

# define COVID blame-framing keywords found in our meme analysis
covid_keywords = [
'china', 'chinese', 'wuhan', 'virus', 'coronavirus', 'covid',
'pandemic', 'mask', 'lockdown', 'quarantine', 'fauci',
'death', 'blame', 'failed', 'incompetent'
]

# count keyword mentions per speaker (normalized per 1000 words)
def keyword_rate(text, keywords):
    # Count keyword occurrences per 1000 words.
    if not isinstance(text, str):
        return {k: 0 for k in keywords}
    text_lower = text.lower()
    words = len(text_lower.split())
    if words == 0:
        return {k: 0 for k in keywords}
    return {k: text_lower.count(k) / words * 1000 for k in keywords}

# apply to each speech
kw_records = []

for _, row in covid_period.iterrows():
    rates = keyword_rate(row['CleanText'], covid_keywords)
    rates['speaker'] = row['speaker']
    rates['party'] = row['party']
    kw_records.append(rates)

kw_df = pd.DataFrame(kw_records)

# average keyword rate per speaker
kw_summary = kw_df.groupby('speaker')[covid_keywords].mean().round(3)
print("Average keyword mentions per 1,000 words (2020+ speeches):")
print(kw_summary.T.to_string())

Speeches from 2020 onward: 766
speaker
JoeBiden        373
DonaldTrump     263
KamalaHarris     77
MikePence        53
Name: count, dtype: int64

Average keyword mentions per 1,000 words (2020+ speeches):
speaker      DonaldTrump  JoeBiden  KamalaHarris  MikePence
china              0.874     0.297         0.042      0.421
chinese            0.061     0.046         0.024      0.022
wuhan              0.009     0.001         0.000      0.026
virus              1.130     2.032         1.364      1.243
coronavirus        0.604     0.486         0.798      1.184
covid              0.333     1.846         0.980      0.000
pandemic           0.457     2.038         1.730      0.779
mask               0.052     0.287         0.131      0.074
lockdown           0.034     0.024         0.000      0.000
quarantine         0.013     0.041         0.000      0.000
fauci              0.023     0.057         0.000      0.024
death              0.216     0.349         0.180      0.102
blame              0.015     0.032         0.000      0.000
failed             0.087     0.691         0.484      0.083
incompetent        0.008     0.006         0.000      0.000

# visualize COVID keyword usage per speaker as heatmap
    # heatmap lets us see at a glance which speakers use which blame-framing words most
import plotly.figure_factory as ff

# prepare matrix
kw_matrix = kw_summary.values
kw_speakers = kw_summary.index.tolist()
kw_labels = covid_keywords

# normalize each keyword to 0-1 so colors are comparable across keywords
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
kw_norm = scaler.fit_transform(kw_matrix.T).T    # normalize per keyword

fig_s4 = go.Figure(data=go.Heatmap(
    z=kw_norm,
    x=kw_labels,
    y=kw_speakers,
    colorscale='RdBu_r',
    text=kw_matrix.round(2),
    texttemplate='%{text}',
    textfont={'size': 9},
    hovertemplate='%{y} -- %{x}<br>Rate per 1000 words: %{text}<extra></extra>'))

fig_s4.update_layout(
    title=dict(
        text='COVID Blame-Framing Keywords in 2020+ Speeches<br>'
            '<sup>Color = normalized frequency | numbers = raw rate per 1,000 words</sup>',
    x=0.5   
    ),
    xaxis=dict(title='Keyword', tickangle=-35),
    yaxis=dict(title='Speaker'),
    height=350,
    font=dict(size=10)
)
fig_s4.show()

# load librosa and set up paths 
    # NOTE: librosa is the standard Python library for audio feature extraction
import librosa
import librosa.display
import numpy as np
import pandas as pd
import os
import glob

# paths
AUDIO_DIR = '../data/audio_data/data/split audio'
TIMESTAMP_DIR = '../data/audio_data/data/timestamps'

# verify files are found
mp3_files = glob.glob(os.path.join(AUDIO_DIR, '*.mp3'))
ts_files = glob.glob(os.path.join(TIMESTAMP_DIR, '*.csv'))

print(f"MP3 files found: {len(mp3_files)}")
print(f"Timestamp files found: {len(ts_files)}")
print()
for f in sorted(mp3_files):
    print(' ', os.path.basename(f))

MP3 files found: 17
Timestamp files found: 17

  us_election_2020_1st_presidential_debate_part1.mp3
  us_election_2020_1st_presidential_debate_part2.mp3
  us_election_2020_2nd_presidential_debate_part1.mp3
  us_election_2020_2nd_presidential_debate_part2.mp3
  us_election_2020_biden_town_hall_part1.mp3
  us_election_2020_biden_town_hall_part2.mp3
  us_election_2020_biden_town_hall_part3.mp3
  us_election_2020_biden_town_hall_part4.mp3
  us_election_2020_biden_town_hall_part5.mp3
  us_election_2020_biden_town_hall_part6.mp3
  us_election_2020_biden_town_hall_part7.mp3
  us_election_2020_trump_town_hall_1.mp3
  us_election_2020_trump_town_hall_2.mp3
  us_election_2020_trump_town_hall_3.mp3
  us_election_2020_trump_town_hall_4.mp3
  us_election_2020_vice_presidential_debate_1.mp3
  us_election_2020_vice_presidential_debate_2.mp3

ts_records = []
for ts_path in sorted(ts_files):
    basename = os.path.basename(ts_path).replace('_timestamp.csv', '')
    try:
        df = pd.read_csv(ts_path, header=None,
                         names=['id', 'start_time', 'end_time', 'text'],
                         on_bad_lines='skip')
        df['audio_file'] = basename
        ts_records.append(df)
    except Exception as e:
        print(f"Could not load {basename}: {e}")

print(f"Loaded {len(ts_records)} files")

timestamps_df = pd.concat(ts_records, ignore_index=True)
timestamps_df['start_time'] = pd.to_numeric(timestamps_df['start_time'], errors='coerce')
timestamps_df['end_time']   = pd.to_numeric(timestamps_df['end_time'],   errors='coerce')
timestamps_df = timestamps_df.dropna(subset=['start_time', 'end_time'])

print(f"Total utterances: {len(timestamps_df)}")
print(f"Audio files covered: {timestamps_df['audio_file'].nunique()}")
print(timestamps_df.head(3).to_string(index=False))

Loaded 17 files
Total utterances: 6527
Audio files covered: 17
     id  start_time  end_time                                                                                                                    text                                     audio_file
f000001         0.0      5.80                                             Gentlemen, a lot of people been waiting for this night, so let’s get going. us_election_2020_1st_presidential_debate_part1
f000002         5.8      9.00                                                                                 Our first subject is the Supreme Court. us_election_2020_1st_presidential_debate_part1
f000003         9.0     17.88 President Trump, you nominated Amy Coney Barrett over the weekend to succeed the late Ruth Bader Ginsburg on the Court. us_election_2020_1st_presidential_debate_part1

# load speaker labels from tokenised text split CSVs
    # the _split.csv files in 'tokenised text/' have two columns:
        # 'speaker' and 'text' -- one row per utterance in order
# we use these to know which utterances belong to which speaker then cross-reference with the timestamp CSVs by row position

TOKENISED_DIR = '../data/audio_data/data/tokenised text'

# debate stem -> split CSV filename
split_csv_map = {
 'us_election_2020_1st_presidential_debate':
 'us_election_2020_1st_presidential_debate_split.csv',
 'us_election_2020_2nd_presidential_debate':
 'us_election_2020_2nd_presidential_debate_split.csv',
 'us_election_2020_biden_town_hall':
 'us_election_2020_biden_town_hall_split.csv',
 'us_election_2020_trump_town_hall':
 'us_election_2020_trump_town_hall_split.csv',
 'us_election_2020_vice_presidential_debate':
 'us_election_2020_vice_presidential_debate_split.csv',
}

speaker_dfs = {}
for stem, csv_name in split_csv_map.items():
    path = os.path.join(TOKENISED_DIR, csv_name)
    try:
        df = pd.read_csv(path)
        speaker_dfs[stem] = df
        print(f"{stem}: {len(df)} rows,\t" f"speakers: {df['speaker'].unique().tolist()}")
    except FileNotFoundError:
       print(f"Not found: {path}")

us_election_2020_1st_presidential_debate: 1904 rows,	speakers: ['Chris Wallace', 'Joe Biden', 'Donald Trump']
us_election_2020_2nd_presidential_debate: 1674 rows,	speakers: ['Kristen Welker', 'Donald Trump', 'Joe Biden']
us_election_2020_biden_town_hall: 838 rows,	speakers: ['George Stephanopoulos', 'Joe Biden', 'Audience Member 1', 'Audience Member 2', 'Audience Member 3', 'Voice Over', 'Audience Member 4', 'Audience Member 5', 'Audience Member 6', 'Audience Member 7', 'Audience Member 8', 'Audience Member 9', 'Audience Member 10', 'Audience Member 11']
us_election_2020_trump_town_hall: 1138 rows,	speakers: ['Savannah Guthrie', 'Voice Over', 'Donald Trump', 'Audience Member 12', 'Audience Member 13', 'Audience Member 14', 'Audience Member 15', 'Audience Member 16', 'Audience Member 17', 'Audience Member 18', 'Audience Member 19']
us_election_2020_vice_presidential_debate: 1028 rows,	speakers: ['Susan Page', 'Kamala Harris', 'Mike Pence']

# build a mapping from debate stem name to full MP3 path
    # the timestamp CSV stems match the MP3 filenames exactly
audio_map = {}

for mp3_path in mp3_files:
    key = os.path.basename(mp3_path).replace('.mp3', '')
    audio_map[key] = mp3_path

print("Audio file mapping confirmed:")
for k in sorted(audio_map.keys()):
    print(f"\t{k}")

Audio file mapping confirmed:
	us_election_2020_1st_presidential_debate_part1
	us_election_2020_1st_presidential_debate_part2
	us_election_2020_2nd_presidential_debate_part1
	us_election_2020_2nd_presidential_debate_part2
	us_election_2020_biden_town_hall_part1
	us_election_2020_biden_town_hall_part2
	us_election_2020_biden_town_hall_part3
	us_election_2020_biden_town_hall_part4
	us_election_2020_biden_town_hall_part5
	us_election_2020_biden_town_hall_part6
	us_election_2020_biden_town_hall_part7
	us_election_2020_trump_town_hall_1
	us_election_2020_trump_town_hall_2
	us_election_2020_trump_town_hall_3
	us_election_2020_trump_town_hall_4
	us_election_2020_vice_presidential_debate_1
	us_election_2020_vice_presidential_debate_2

# assign speakers to utterances based on debate structure
    # presidential debates: Trump and Biden both present
    # biden town hall parts: Biden only
    # trump town hall parts: Trump only
    # vice presidential debate: Pence and Harris

def assign_speaker_from_file(audio_file):
    """
    Returns speaker name(s) present in a given audio file. 
    For debates with multiple speakers we return 'mixed', speaker-level splitting requires the preprocessed dataset.
    """
    af = audio_file.lower()
    if 'biden_town_hall' in af:
        return 'Joe Biden'
    elif 'trump_town_hall' in af:
        return 'Donald Trump'
    elif 'vice_presidential' in af:
        return 'mixed_pence_harris'
    elif 'presidential_debate' in af:
        return 'mixed_trump_biden'
    return 'unknown'

timestamps_df['inferred_speaker'] = timestamps_df['audio_file'].apply(assign_speaker_from_file)

# keep only single-speaker files for clean feature extraction
single_speaker = timestamps_df[~timestamps_df['inferred_speaker'].str.startswith('mixed')].copy()

print("Utterances by inferred speaker (single-speaker files only):")
print(single_speaker['inferred_speaker'].value_counts())

Utterances by inferred speaker (single-speaker files only):
inferred_speaker
Donald Trump    1132
Joe Biden        838
Name: count, dtype: int64

# extract acoustic features for sampled utterances per speaker
    # for each utterance we clip the right seconds of audio using the timestamp
    # then extract ZCR, RMS, MFCC, and Chroma using librosa
# NOTE: this may take a few minutes depending on your machine
 
MAX_UTTERANCES = 100    # increase if you have more time/compute
SR = 22050    # librosa default sample rate
 
 
def extract_features(audio_path, start_sec, end_sec, sr=SR):
    """
    Load only the utterance segment using offset + duration.
    Returns a dict of feature values or None if the clip is too short
    """
    duration = end_sec - start_sec
    if duration <= 0.1:
        return None
    try:
        y, _ = librosa.load(
            audio_path, sr=sr, offset=start_sec, duration=duration
        )
        if len(y) < sr * 0.1:
            return None
 
        # zero crossing rate -- higher = more energetic consonant-heavy speech
        zcr = float(np.mean(librosa.feature.zero_crossing_rate(y)))
 
        # RMS energy -- average loudness of the utterance
        rms = float(np.mean(librosa.feature.rms(y=y)))
 
        # MFCCs -- 13 mel-frequency cepstral coefficients
        # captures the tonal/spectral fingerprint of the voice
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
        mfcc_mean = np.mean(mfcc, axis=1)
 
        # Chroma STFT -- 12 pitch class features
        # captures harmonic content, highlighted in Chowdhury et al. (2025)
        chroma = librosa.feature.chroma_stft(y=y, sr=sr)
        chroma_mean = float(np.mean(chroma))
 
        result = {'zcr': zcr, 'rms': rms, 'chroma': chroma_mean}
        for j, val in enumerate(mfcc_mean):
            result[f'mfcc_{j+1}'] = float(val)
        return result
 
    except Exception:
        return None
 
 
# sample utterances per speaker and extract features
feature_records = []
 
for speaker in ['Joe Biden', 'Donald Trump']:
    subset = single_speaker[
        single_speaker['inferred_speaker'] == speaker
    ]
    subset = subset.sample(
        n=min(MAX_UTTERANCES, len(subset)), random_state=42
    )
    print(f"Extracting features for {speaker} ({len(subset)} utterances)...")
 
    for _, row in subset.iterrows():
        audio_path = audio_map.get(row['audio_file'])
        if not audio_path:
            continue
        feats = extract_features(
            audio_path, row['start_time'], row['end_time']
        )
        if feats:
            feats['speaker'] = speaker
            feats['audio_file'] = row['audio_file']
            feature_records.append(feats)
 
feat_df = pd.DataFrame(feature_records)
print(f"\nFeatures extracted: {len(feat_df)} utterances")
print()
print("Summary by speaker:")
print(
    feat_df.groupby('speaker')[['zcr', 'rms', 'chroma', 'mfcc_1']]
    .describe()
    .round(4)
)

Extracting features for Joe Biden (100 utterances)...
Extracting features for Donald Trump (100 utterances)...

Features extracted: 197 utterances

Summary by speaker:
                zcr                                                          \
              count    mean     std     min     25%     50%     75%     max   
speaker                                                                       
Donald Trump   97.0  0.1519  0.0386  0.0738  0.1276  0.1515  0.1775  0.2514   
Joe Biden     100.0  0.1053  0.0251  0.0403  0.0904  0.1011  0.1213  0.1921   

                rms          ... chroma         mfcc_1                     \
              count    mean  ...    75%     max  count      mean      std   
speaker                      ...                                            
Donald Trump   97.0  0.0435  ...  0.361  0.4849   97.0 -243.0097  26.1616   
Joe Biden     100.0  0.0424  ...  0.396  0.6170  100.0 -261.5239  15.3010   

                                                                
                   min       25%       50%       75%       max  
speaker                                                         
Donald Trump -328.3606 -260.2521 -248.7375 -232.2854 -167.4580  
Joe Biden    -304.4760 -272.0928 -260.5080 -251.2004 -219.8857  

[2 rows x 32 columns]

# box plots: ZCR, RMS, Chroma, and MFCC-1 by speaker
    # four panels side by side, colored by party
        # boxmean=True shows the mean as a dashed line inside each box
import plotly.graph_objects as go
from plotly.subplots import make_subplots
 
speaker_colors = {'Donald Trump': '#c0392b', 'Joe Biden': '#2980b9'}
 
features_to_plot = ['zcr', 'rms', 'chroma', 'mfcc_1']
feature_labels = [
    'Zero Crossing Rate (ZCR)',
    'RMS Energy',
    'Chroma STFT (Harmonic Content)',
    'MFCC-1 (Spectral Energy)'
]
 
fig_a1 = make_subplots(rows=1, cols=4, subplot_titles=feature_labels)
 
for col_idx, feat in enumerate(features_to_plot, 1):
    for speaker in ['Donald Trump', 'Joe Biden']:
        subset = feat_df[feat_df['speaker'] == speaker][feat].dropna()
        fig_a1.add_trace(
            go.Box(
                y=subset,
                name=speaker,
                marker_color=speaker_colors[speaker],
                showlegend=(col_idx == 1),
                legendgroup=speaker,
                boxmean=True
            ),
            row=1, col=col_idx
        )
 
fig_a1.update_layout(
    title=dict(
        text=(
            'Acoustic Feature Comparison: Trump vs. Biden '
            '(2020 Town Hall Debates)<br>'
            '<sup>Each point = one utterance | '
            'dashed line inside box = mean | '
            'features follow Chowdhury et al. (2025)</sup>'
        ),
        x=0.5
    ),
    height=480,
    font=dict(size=10),
    boxmode='group',
    legend=dict(title='Speaker')
)
fig_a1.show()

# print numerical summary for more interpretation
print("=" * 60)
print("ACOUSTIC FEATURE SUMMARY BY SPEAKER")
print("=" * 60)
summary = feat_df.groupby('speaker')[
    ['zcr', 'rms', 'chroma', 'mfcc_1']
].agg(['mean', 'median', 'std']).round(5)
print(summary.to_string())

============================================================
ACOUSTIC FEATURE SUMMARY BY SPEAKER
============================================================
                  zcr                        rms                     chroma                       mfcc_1                     
                 mean   median      std     mean   median      std     mean   median      std       mean     median       std
speaker                                                                                                                      
Donald Trump  0.15191  0.15155  0.03864  0.04352  0.04285  0.00765  0.34061  0.33933  0.03908 -243.00967 -248.73750  26.16161
Joe Biden     0.10530  0.10111  0.02506  0.04240  0.04358  0.00661  0.38049  0.37709  0.05093 -261.52390 -260.50801  15.30103

# build cross-modal comparison table
    # acoustic averages per speaker from Step 8.2
acoustic_summary = feat_df.groupby('speaker')[
    ['zcr', 'rms', 'chroma', 'mfcc_1']
].mean()
 
# villain counts from meme data (Step 3)
    # how many times does each politician appear as villain across all memes?
speaker_to_meme = {
    'Donald Trump': 'donald trump',
    'Joe Biden':    'joe biden'
}
 
villain_counts = {}
for speaker, meme_name in speaker_to_meme.items():
    count = sum(
        1 for tags in train_df['villain_parsed']
        if meme_name in [t.lower() for t in tags]
    )
    villain_counts[speaker] = count
 
print("Villain tag counts in meme dataset:")
for spk, cnt in villain_counts.items():
    print(f"  {spk}: {cnt} villain appearances")
print()
print("Acoustic feature averages:")
print(acoustic_summary.round(5).to_string())

Villain tag counts in meme dataset:
  Donald Trump: 504 villain appearances
  Joe Biden: 162 villain appearances

Acoustic feature averages:
                  zcr      rms   chroma     mfcc_1
speaker                                           
Donald Trump  0.15191  0.04352  0.34061 -243.00967
Joe Biden     0.10530  0.04240  0.38049 -261.52390

# combined visualization: acoustic profile vs. meme villain count
    # 4 panels: ZCR, RMS, Chroma, and Villain Count side by side directly shows whether acoustic energy aligns with meme blame assignment
fig_a2 = make_subplots(
    rows=1, cols=4,
    subplot_titles=(
        'Avg ZCR (Speech Energy)',
        'Avg RMS Energy',
        'Avg Chroma (Harmonic)',
        'Villain Tags in Memes'
    )
)
 
speakers  = ['Donald Trump', 'Joe Biden']
sp_colors = ['#c0392b', '#2980b9']
 
for col_idx, feat in enumerate(['zcr', 'rms', 'chroma'], 1):
    for spk, col in zip(speakers, sp_colors):
        fig_a2.add_trace(
            go.Bar(
                x=[spk],
                y=[acoustic_summary.loc[spk, feat]],
                marker_color=col,
                name=spk,
                showlegend=(col_idx == 1),
                legendgroup=spk
            ),
            row=1, col=col_idx
        )
 
# villain counts panel
for spk, col in zip(speakers, sp_colors):
    fig_a2.add_trace(
        go.Bar(
            x=[spk],
            y=[villain_counts[spk]],
            marker_color=col,
            name=spk,
            showlegend=False,
            legendgroup=spk
        ),
        row=1, col=4
    )
 
fig_a2.update_layout(
    title=dict(
        text=(
            'Acoustic Energy vs. Meme Villain Framing -- '
            'Trump vs. Biden<br>'
            '<sup>Does higher speech energy match higher villain '
            'assignment in memes?</sup>'
        ),
        x=0.5
    ),
    height=430,
    font=dict(size=10),
    barmode='group',
    showlegend=True
)
fig_a2.show()

	OCR	image	hero	villain	victim	other
0	Bernie or Elizabeth? Be informed. Compare them...	covid_memes_18.png	NaN	NaN	NaN	['bernie sanders', 'elizabeth warren']
1	Extending the Brexit deadline until October 31...	covid_memes_19.png	NaN	['uk government']	NaN	NaN
2	kwai gkwa 0964 #nnevvy applause to Thais from ...	covid_memes_252.png	['thais']	NaN	NaN	['hong kong']

Mini-Project 3: Beyond the Joke: A Multimodal Analysis of Blame and Framing in US Political and COVID Memes¶

1. Dataset (Creation and Preprocessing)¶

1.1: Load Libraries and Data¶

1.2: Inspect and Understand the Structure¶

1.3: Preprocessing the OCR Text¶

Preprocessing Summary¶

2. Primary Methods¶

3. Text Analysis¶

3.1: Entity Frequency: Who gets cast as villain, hero, and victim?¶

3.1: Entity Frequency (brief interpretation based on code results and figures above):¶

3.2: Sentiment Analysis with VADER¶

3.2: Sentiment Analysis (brief interpretation based on code results and figures above):¶

3.3: TF-IDF (What Words Distinguish Each Category?)¶

3.3: TF-IDF (brief interpretation based on code results and figures above):¶

4. Image Analysis¶

4.1: Load and Organize Images¶

4.2 Color Palette Analysis¶

4.2: Color Palette Analysis (brief interpretation based on code results and figures above):¶

4.3 Text Coverage Ratio¶

4.3: Text Coverage Ratio (brief interpretation based on code results and figures above):¶

5. Putting it Together (Combined Results Summary)¶

5: Combined Summary (brief interpretation based on code results and figures above):¶

6. Stability Check (do our findings hold on the validation set?)¶

6: Stability Check (brief interpretation based on code results and figures above):¶

7. Expand Analysis to Audio/Speech Data¶

7.1 Load and Prepare the Speech Dataset¶

7.2 Sentiment: Speeches vs. Memes¶

7.2: Sentiment (brief interpretation based on figures above):¶

7.3 TF-IDF: Speech Vocabulary vs. Meme Vocabulary¶

7.3: TF-IDF (brief interpretation based on code results and figures above):¶

7.4 COVID Language: Do Speeches Blame the Same Targets as Memes?¶

7.4: Covid Language (brief interpretation based on code results and figures above):¶

8. Acoustic Feature Analysis¶

8.1 Install librosa and Load Timestamp Data¶

8.2 Extract Acoustic Features per Speaker¶

8.3 Visualize: Do the Speakers Sound Different?¶

8.3: Audio Feature Comparison (brief interpretation based on code results and figures above):¶

8.4 Connecting Acoustic Features to Meme Framing¶

8.4: Acoustic to Meme Connection (brief interpretation based on code results and figures above):¶

Results/Discussion¶

Limitations & Directions for Future Study¶

References¶

Datasets:¶

Methods and Tools:¶