import pandas as pd
import os 

# Resolve the data folder relative to the current working directory.
# NOTE: notebooks do not define __file__, and the previous quoted "__file__" was
# only a literal file name, so the old expression always evaluated to the working
# directory. os.getcwd() states that dependency explicitly: the notebook must be
# run from its own folder for '../data' to resolve.
notebook_dir = os.getcwd()
data_dir = os.path.join(notebook_dir, '..', 'data')

# Peek at each dataset: read only the first few rows so large files load instantly
preview_rows = 3
for fname in ('data.csv', 'distribution.csv', 'prompts.csv'):
    df = pd.read_csv(os.path.join(data_dir, fname), nrows=preview_rows)
    print(f"\n=== {fname} ===")
    print(f"Columns: {df.columns.tolist()}")
    print(df)  # at most preview_rows rows were read, so no extra head() is needed

=== data.csv ===
Columns: ['text', 'source', 'prompt_id', 'text_length', 'word_count']
                                                text    source  prompt_id  \
0  Federal law supersedes state law, and cannabis...  Bloom-7B          0   
1  Miles feels restless after working all day. He...  Bloom-7B          0   
2  So first of I am danish. That means that I fol...  Bloom-7B          0   

   text_length  word_count  
0          967         157  
1         5068         778  
2         1602         267  

=== distribution.csv ===
Columns: ['Source', 'Number of Samples', 'Percentage of Total Data', 'Text Length Sum', 'Text Length Mean', 'Text Length Median', 'Text Length Std', 'Text Length Max', 'Text Length Min', 'Word Count Sum', 'Word Count Mean', 'Word Count Median', 'Word Count Std', 'Word Count Max', 'Word Count Min']
             Source  Number of Samples Percentage of Total Data  \
0             Human             347692                 44.0718%   
1           GPT-3.5              52346                  6.6351%   
2  Text-Davinci-003              22860                  2.8976%   

   Text Length Sum  Text Length Mean  Text Length Median  Text Length Std  \
0       1555649148          4474.216                2288         6989.088   
1        147829489          2824.084                3290         1797.105   
2         21012437           919.179                 727          590.805   

   Text Length Max  Text Length Min  Word Count Sum  Word Count Mean  \
0           890119              110       246977688          710.335   
1            23940              116        22633379          432.380   
2             5313              116         3584391          156.798   

   Word Count Median  Word Count Std  Word Count Max  Word Count Min  
0                396        1003.481           71543              25  
1                505         263.848            3565              25  
2                121          98.634             822              25  

=== prompts.csv ===
Columns: ['Prompt ID', 'Prompt']
   Prompt ID                                             Prompt
0          0                                          Undefined
1          1  Anything, be creative and make this about any ...
2          2                   Does the electoral college work?

# Read distribution.csv in full (no row limit this time) so every source is listed
dist_path = os.path.join(data_dir, 'distribution.csv')
dist_df = pd.read_csv(dist_path)

# Only the per-source counts and shares are shown here
overview_cols = ['Source', 'Number of Samples', 'Percentage of Total Data']
print(dist_df[overview_cols])

                      Source  Number of Samples Percentage of Total Data
0                      Human             347692                 44.0718%
1                    GPT-3.5              52346                  6.6351%
2           Text-Davinci-003              22860                  2.8976%
3           Text-Davinci-002              21436                  2.7171%
4                   OPT-1.3B              18467                  2.3408%
..                       ...                ...                      ...
58                Toppy-M-7B                433                  0.0549%
59                LLaMA-2-7B                409                  0.0518%
60      Dolphin-Mixtral-8x7B                407                  0.0516%
61            Cohere-Command                390                  0.0494%
62  Dolphin-2.5-Mixtral-8x7B                228                  0.0289%

[63 rows x 3 columns]

import numpy as np

# Configuration: key parameters for sampling
SAMPLE_SIZE = 10000   # number of documents to draw from the full corpus
RANDOM_SEED = 230     # fixed seed for reproducibility (can be anything - INFO 230 - for fun!)

# Data folder, resolved from the current working directory.
# NOTE: the previous os.path.abspath("__file__") used a quoted literal, so it also
# resolved to the working directory; os.getcwd() makes that explicit.
notebook_dir = os.getcwd()
data_dir = os.path.join(notebook_dir, '..', 'data')

# Load the full dataset, then immediately down-sample to keep computation manageable
print("Loading data.csv...")
ai_human_df = pd.read_csv(os.path.join(data_dir, 'data.csv'))
print(f"Full dataset size: {len(ai_human_df):,} rows")

# Reproducible random sample. DataFrame.sample raises if n exceeds the number of
# rows, so cap the request at the corpus size.
n_to_draw = min(SAMPLE_SIZE, len(ai_human_df))
ai_human_df_sample = (
    ai_human_df
    .sample(n=n_to_draw, random_state=RANDOM_SEED)
    .reset_index(drop=True)   # fresh 0..n-1 index
)
print(f"Sample size: {len(ai_human_df_sample):,} rows")

# Binary label: 'Human' -> 0, every other source -> 1
# NOTE(review): rows whose source is 'Unknown' (listed in distribution.csv) are
# therefore labelled as AI - confirm this is intended.
ai_human_df_sample['label'] = (ai_human_df_sample['source'] != 'Human').astype(int)

# Persist the labelled sample next to the raw data
ai_human_df_sample.to_csv(os.path.join(data_dir, 'ai_human_sample_10k.csv'), index=False)

print("\nLabel distribution in sample:")
print(ai_human_df_sample['label'].value_counts().rename({0: 'Human', 1: 'AI'}))
print("\nSample preview:")
ai_human_df_sample[['text', 'source', 'label', 'word_count']].head(3)

Loading data.csv...
Full dataset size: 788,922 rows
Sample size: 10,000 rows

Label distribution in sample:
label
AI       5548
Human    4452
Name: count, dtype: int64

Sample preview:

import matplotlib.pyplot as plt
import seaborn as sns

# Part 1: Sample-level Stats
print("===CORPUS STATISTICS===")

# How large is the working sample relative to the full corpus?
full_docs = len(ai_human_df)
sample_docs = len(ai_human_df_sample)
print("\n ---Document Counts---")
print(f"\t Full dataset: {full_docs:>10,} documents")
print(f"\t Working sample: {sample_docs:>10,} documents")
print(f"\t Sample fraction: {sample_docs/full_docs*100:.2f}% of full corpus")

# Counts and percentages per class, re-keyed from 0/1 to readable names
print("\n ---Label Distribution (Sample)---")
label_display = {0: 'Human', 1: 'AI'}
sample_labels = ai_human_df_sample['label']
label_counts = sample_labels.value_counts().rename(label_display)
label_pcts = sample_labels.value_counts(normalize=True).rename(label_display) * 100
for label in ('Human', 'AI'):
    print(f"\t {label:<8} {label_counts[label]:>6,} documents ({label_pcts[label]:.1f}%)")

# Word-count summary for each class, taken from the 'word_count' column
print("\n ---Text Length Statistics (Sample, in words)---")
for label_name, label_val in (('Human', 0), ('AI', 1)):
    is_class = ai_human_df_sample['label'] == label_val
    subset = ai_human_df_sample.loc[is_class, 'word_count']
    print(f"\n {label_name}:")
    print(f"\t Mean: {subset.mean():.1f} words")
    print(f"\t Median: {subset.median():.1f} words")
    print(f"\t Stdev: {subset.std():.1f} words")
    print(f"\t Min: {subset.min()} words")
    print(f"\t Max: {subset.max()} words")

# Rough vocabulary size: lowercase, split on whitespace, count distinct tokens
print("\n ---Vocabulary Size (Sample)---")
lowercased_text = ai_human_df_sample['text'].str.lower()
all_words = lowercased_text.str.split().explode()
print(f"\t Unique tokens (raw):  {all_words.nunique():,}")

===CORPUS STATISTICS===

 ---Document Counts---
	 Full dataset:    788,922 documents
	 Working sample:     10,000 documents
	 Sample fraction: 1.27% of full corpus

 ---Label Distribution (Sample)---
	 Human     4,452 documents (44.5%)
	 AI        5,548 documents (55.5%)

 ---Text Length Statistics (Sample, in words)---

 Human:
	 Mean: 700.8 words
	 Median: 386.5 words
	 Stdev: 1022.8 words
	 Min: 25 words
	 Max: 23392 words

 AI:
	 Mean: 335.9 words
	 Median: 283.0 words
	 Stdev: 279.3 words
	 Min: 25 words
	 Max: 5019 words

 ---Vocabulary Size (Sample)---
	 Unique tokens (raw):  185,152

# Part 2: Full Dataset Stats (pulled from distribution.csv)
# distribution.csv carries per-source statistics for the whole corpus, which the
# working sample alone cannot provide for the writeup.
print("\n── Full Dataset Source Distribution (from distribution.csv) ──")
dist_df = pd.read_csv(os.path.join(data_dir, 'distribution.csv'))

summary_cols = [
    'Source',
    'Number of Samples',
    'Percentage of Total Data',
    'Word Count Mean',
    'Word Count Median',
]
# to_string(index=False) prints every row without the row index
print(dist_df[summary_cols].to_string(index=False))

── Full Dataset Source Distribution (from distribution.csv) ──
                   Source  Number of Samples Percentage of Total Data  Word Count Mean  Word Count Median
                    Human             347692                 44.0718%          710.335              396.0
                  GPT-3.5              52346                  6.6351%          432.380              505.0
         Text-Davinci-003              22860                  2.8976%          156.798              121.0
         Text-Davinci-002              21436                  2.7171%          159.292              107.0
                 OPT-1.3B              18467                  2.3408%          251.045              133.0
                  OPT-30B              18055                  2.2886%          223.312              129.0
  Nous-Hermes-LLaMA-2-13B              12686                  1.6080%          549.696              505.0
               Mistral-7B              10439                  1.3232%          374.923              380.0
                   PaLM-2               9510                  1.2054%          419.990              413.0
             OpenChat-3.5               9402                  1.1918%          616.382              607.0
                LLaMA-30B               9340                  1.1839%          393.633              336.0
                LLaMA-65B               9321                  1.1815%          391.059              331.0
                LLaMA-13B               9282                  1.1765%          435.495              415.0
                 LLaMA-7B               9271                  1.1751%          480.182              526.0
                    T0-3B               9219                  1.1686%           57.816               42.0
             Flan-T5-Base               9201                  1.1663%           44.999               38.0
            Flan-T5-Large               9164                  1.1616%           46.985               40.0
            Flan-T5-Small               9144                  1.1590%           39.586               37.0
                 OPT-2.7B               9134                  1.1578%          254.533              149.0
              Flan-T5-XXL               9113                  1.1551%           86.004               53.0
                 GLM-130B               9071                  1.1498%          529.154              646.0
               Flan-T5-XL               8986                  1.1390%           51.224               43.0
                    GPT-4               8852                  1.1220%          621.655              637.0
                 OPT-6.7B               8838                  1.1203%          256.804              153.0
                 OPT-125M               8823                  1.1184%          255.521              167.0
                 Bloom-7B               8812                  1.1170%          325.537              236.5
                 OPT-350M               8747                  1.1087%          326.137              220.0
                   T0-11B               8705                  1.1034%           49.660               40.0
                  OPT-13B               8087                  1.0251%          213.094              117.0
                    GPT-J               7580                  0.9608%          470.693              394.0
        Claude-Instant-v1               7147                  0.9059%          453.450              447.0
                 GPT-NeoX               6821                  0.8646%          140.098              106.0
          MythoMax-L2-13B               6147                  0.7792%          455.178              445.0
                  Unknown               6093                  0.7723%          272.031              284.0
           Neural-Chat-7B               5858                  0.7425%          636.037              634.0
                 LZLV-70B               5143                  0.6519%          437.645              447.0
              LLaMA-2-70B               5000                  0.6338%          422.705              420.0
              Falcon-180B               4745                  0.6015%          372.009              357.0
           Psyfighter-13B               4375                  0.5546%          560.545              563.0
     StripedHyena-Nous-7B               3520                  0.4462%          496.181              491.0
                   YI-34B               3520                  0.4462%          664.171              649.0
        Nous-Capybara-34B               3327                  0.4217%          541.708              540.0
         Nous-Capybara-7B               3204                  0.4061%          588.157              558.0
                Claude-v1               3158                  0.4003%          388.771              388.0
      Mistral-7B-OpenOrca               3058                  0.3876%          545.278              542.0
             Mixtral-8x7B               2865                  0.3632%          691.358              569.0
         Psyfighter-2-13B               2743                  0.3477%          571.315              572.0
             Noromaid-20B               1326                  0.1681%          476.931              479.0
         Text-Davinci-001               1120                  0.1420%          245.464              236.0
           Text-Curie-001               1008                  0.1278%          240.240              225.0
         Text-Babbage-001                875                  0.1109%          208.003              194.0
             Goliath-120B                734                  0.0930%          255.871              258.0
             Text-Ada-001                691                  0.0876%          230.527              193.0
  Nous-Hermes-LLaMA-2-70B                650                  0.0824%          803.945              745.0
  OpenHermes-2-Mistral-7B                623                  0.0790%          546.400              541.0
               Gemini-Pro                613                  0.0777%          579.868              566.0
OpenHermes-2.5-Mistral-7B                612                  0.0776%          570.420              566.0
          RWKV-5-World-3B                496                  0.0629%          391.734              383.0
               Toppy-M-7B                433                  0.0549%          616.993              598.0
               LLaMA-2-7B                409                  0.0518%          291.298              292.0
     Dolphin-Mixtral-8x7B                407                  0.0516%          628.459              592.0
           Cohere-Command                390                  0.0494%          333.787              312.5
 Dolphin-2.5-Mixtral-8x7B                228                  0.0289%          583.404              524.0

# Part 3: Visualizations
# One row of four panels: class balance, per-class word-count histograms, and a boxplot.

fig, axes = plt.subplots(1, 4, figsize=(22, 5))
main_title = 'Corpus Statistics: Human vs. AI Text'

# Plot 1: Label distribution bar chart
# value_counts() orders by frequency (AI first in this sample), so reorder to
# Human, AI explicitly. Otherwise the hard-coded tick labels and colors end up on
# the wrong bars.
ordered_counts = label_counts.reindex(['Human', 'AI'])
ordered_counts.plot(kind='bar', ax=axes[0], color=['steelblue', 'tomato'], edgecolor='black', alpha=0.85)
axes[0].set_title('Label Distribution (Sample)')
axes[0].set_xlabel('Source')
axes[0].set_ylabel('Number of Documents')
axes[0].set_xticklabels(ordered_counts.index, rotation=0)

# Annotate each bar with its raw count just above the bar
for i, v in enumerate(ordered_counts):
    axes[0].text(i, v + 30, str(v), ha='center', fontweight='bold')

# Plot 2: Word count histogram -- Human only
cap = 2000  # trims extreme outliers (some human docs go up to 23k words) so histogram is readable
human_wc = ai_human_df_sample[ai_human_df_sample['label'] == 0]['word_count']
axes[1].hist(human_wc[human_wc <= cap], bins=40, color='steelblue', edgecolor='black', alpha=0.85)
axes[1].set_title('Human: Word Count Distribution')
axes[1].set_xlabel('Word Count')
axes[1].set_ylabel('Frequency')
# the median is computed on the uncapped data
axes[1].axvline(human_wc.median(), color='red', linestyle='--', label=f'Median: {human_wc.median():.0f}')
axes[1].legend()

# Plot 3: Word count histogram -- AI only
ai_wc = ai_human_df_sample[ai_human_df_sample['label'] == 1]['word_count']
axes[2].hist(ai_wc[ai_wc <= cap], bins=40, color='tomato', edgecolor='black', alpha=0.85)
axes[2].set_title('AI: Word Count Distribution')
axes[2].set_xlabel('Word Count')
axes[2].set_ylabel('Frequency')
axes[2].axvline(ai_wc.median(), color='navy', linestyle='--', label=f'Median: {ai_wc.median():.0f}')
axes[2].legend()

# Plot 4: Boxplot of word counts by label
# convert numeric label (0/1) to a readable string ('Human'/'AI') for the grouping axis
ai_human_df_sample['label_name'] = ai_human_df_sample['label'].map({0: 'Human', 1: 'AI'})
ai_human_df_sample[ai_human_df_sample['word_count'] <= cap].boxplot(
    column='word_count', by='label_name', ax=axes[3],
    boxprops=dict(color='black'),
    medianprops=dict(color='red', linewidth=2)
)
axes[3].set_title(f'Word Count Boxplot (capped at {cap})')
axes[3].set_xlabel('Source')
axes[3].set_ylabel('Word Count')

# pandas' grouped boxplot overwrites the figure-level suptitle with its own
# auto-generated text. Set the intended title afterwards; clearing it with
# suptitle('') would leave the figure untitled.
fig.suptitle(main_title, fontsize=14, fontweight='bold')

plt.tight_layout()

# Save next to the data folder instead of a machine-specific absolute path.
# NOTE(review): assumes the project's 'figures' folder is a sibling of 'data' - confirm.
figures_dir = os.path.join(notebook_dir, '..', 'figures')
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(os.path.join(figures_dir, 'EDA_visuals.png'))
plt.show()

# Chunking Strategy 1: Document-level chunking
# Simplest approach: every document is kept whole as a single chunk, so the frame
# only gains bookkeeping columns (doc_id from the row index, chunk_id fixed at 0).
chunks_s1 = (
    ai_human_df_sample[['text', 'source', 'label', 'word_count']]
    .copy()
    .assign(doc_id=lambda frame: frame.index, chunk_id=0)
)

# Size summary of the resulting chunks
s1_sizes = chunks_s1['word_count']
print("===Strategy 1: Document-Level Chunking===")
print(f"\t Total chunks: {len(chunks_s1):,}")
print(f"\t Mean chunk size: {s1_sizes.mean():.1f} words")
print(f"\t Median chunk size: {s1_sizes.median():.1f} words")
print(f"\t Stdev chunk size: {s1_sizes.std():.1f} words")
print(f"\t Min chunk size: {s1_sizes.min()} words")
print(f"\t Max chunk size: {s1_sizes.max()} words")

# Chunking Strategy 2: Fixed-length word window chunking
# Each document is cut into consecutive, non-overlapping windows of CHUNK_SIZE words.
# The last window of a document can be shorter than CHUNK_SIZE; windows with fewer
# than MIN_CHUNK_WORDS words are dropped to avoid tiny (low-signal) chunks.
CHUNK_SIZE = 200        # words per chunk
MIN_CHUNK_WORDS = 50    # shortest chunk that is kept

def chunk_by_words(text, chunk_size=CHUNK_SIZE, min_words=MIN_CHUNK_WORDS):
    """
    Cut *text* into consecutive, non-overlapping windows of chunk_size words.

    Words are whitespace-separated tokens; each kept window is re-joined with
    single spaces. Windows with fewer than min_words words are left out.
    Returns a list of chunk strings (empty when nothing qualifies).
    """
    tokens = text.split()
    windows = (
        tokens[start:start + chunk_size]
        for start in range(0, len(tokens), chunk_size)
    )
    # tokens contain no whitespace, so a window's length is its word count
    return [' '.join(window) for window in windows if len(window) >= min_words]

# Chunk every sampled document. Chunk records are collected in a plain list and
# turned into a DataFrame once (faster than appending rows to a DataFrame one at a time).
rows = []
sample_columns = zip(
    ai_human_df_sample.index,
    ai_human_df_sample['text'],
    ai_human_df_sample['source'],
    ai_human_df_sample['label'],
)
for doc_id, doc_text, doc_source, doc_label in sample_columns:
    # one record per chunk produced from this document; chunk_id restarts at 0
    for chunk_idx, chunk_text in enumerate(chunk_by_words(doc_text)):
        rows.append({
            'doc_id': doc_id,
            'chunk_id': chunk_idx,
            'text': chunk_text,
            'source': doc_source,
            'label': doc_label,
            'word_count': len(chunk_text.split()),
        })

chunks_s2 = pd.DataFrame(rows)

# Size summary of the resulting chunks
s2_sizes = chunks_s2['word_count']
print("\n===Strategy 2: Fixed-Length Word Window Chunking===")
print(f"\t Chunk size setting: {CHUNK_SIZE} words (min: {MIN_CHUNK_WORDS})")
print(f"\t Total chunks: {len(chunks_s2):,}")
print(f"\t Mean chunk size: {s2_sizes.mean():.1f} words")
print(f"\t Median chunk size: {s2_sizes.median():.1f} words")
print(f"\t Stdev chunk size: {s2_sizes.std():.1f} words")
print(f"\t Min chunk size: {s2_sizes.min()} words")
print(f"\t Max chunk size: {s2_sizes.max()} words")

# STRATEGY COMPARISON
# For topic modeling a chunking strategy should yield chunks that are:
#   (1) numerous enough (hundreds to thousands of chunks)
#   (2) consistent in size (low std relative to mean)
#   (3) long enough to carry topical signal (ideally 100-500 words)
# Size consistency is summarised by the coefficient of variation, CV = stdev / mean;
# a lower CV means more uniform chunks.
s1_words = chunks_s1['word_count']
s2_words = chunks_s2['word_count']
cv_s1 = s1_words.std() / s1_words.mean()
cv_s2 = s2_words.std() / s2_words.mean()

# Side-by-side table: one row per metric, one column per strategy
print("\n===STRATEGY COMPARISON===")
print(f"\n{'Metric':<30} {'Strategy 1':>15} {'Strategy 2':>15}")
print(f"{'Total chunks':<30} {len(chunks_s1):>15,} {len(chunks_s2):>15,}")
print(f"{'Mean chunk size (words)':<30} {s1_words.mean():>15.1f} {s2_words.mean():>15.1f}")
print(f"{'Stdev chunk size (words)':<30} {s1_words.std():>15.1f} {s2_words.std():>15.1f}")
print(f"{'Coefficient of Variation':<30} {cv_s1:>15.3f} {cv_s2:>15.3f}")
print(f"{'Min chunk size (words)':<30} {s1_words.min():>15} {s2_words.min():>15}")
print(f"{'Max chunk size (words)':<30} {s1_words.max():>15} {s2_words.max():>15}")

===Strategy 1: Document-Level Chunking===
	 Total chunks: 10,000
	 Mean chunk size: 498.4 words
	 Median chunk size: 326.0 words
	 Stdev chunk size: 736.1 words
	 Min chunk size: 25 words
	 Max chunk size: 23392 words

===Strategy 2: Fixed-Length Word Window Chunking===
	 Chunk size setting: 200 words (min: 50)
	 Total chunks: 27,430
	 Mean chunk size: 178.6 words
	 Median chunk size: 200.0 words
	 Stdev chunk size: 42.5 words
	 Min chunk size: 50 words
	 Max chunk size: 200 words

===STRATEGY COMPARISON===

Metric                              Strategy 1      Strategy 2
Total chunks                            10,000          27,430
Mean chunk size (words)                  498.4           178.6
Stdev chunk size (words)                 736.1            42.5
Coefficient of Variation                 1.477           0.238
Min chunk size (words)                      25              50
Max chunk size (words)                   23392             200

# Visualize the comparison between chunking strategies: one word-count histogram per strategy
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
fig.suptitle('Chunking Strategy Comparison: Word Count Distributions', fontsize=13, fontweight='bold')

cap = 2000  # word counts above this are clipped into the last bin so the x-axis stays readable

# (axis, chunk sizes, bar color, panel title) for each strategy; the Strategy 2
# title reads the window size from CHUNK_SIZE so it cannot drift from the setting
panels = [
    (axes[0], chunks_s1['word_count'], 'mediumpurple',
     f'Strategy 1: Document-Level\n(n={len(chunks_s1):,} chunks, CV={cv_s1:.2f})'),
    (axes[1], chunks_s2['word_count'], 'mediumseagreen',
     f'Strategy 2: Fixed {CHUNK_SIZE}-Word Windows\n(n={len(chunks_s2):,} chunks, CV={cv_s2:.2f})'),
]
for ax, sizes, bar_color, panel_title in panels:
    ax.hist(sizes.clip(upper=cap), bins=40, color=bar_color, edgecolor='black', alpha=0.85)
    # the median is taken from the unclipped sizes
    ax.axvline(sizes.median(), color='red', linestyle='--', label=f"Median: {sizes.median():.0f}")
    ax.set_title(panel_title)
    ax.set_xlabel('Word Count')
    ax.set_ylabel('Frequency')
    ax.legend()

plt.tight_layout()

# Save next to the data folder instead of a machine-specific absolute path.
# NOTE(review): assumes the project's 'figures' folder is a sibling of 'data' - confirm.
figures_dir = os.path.join(notebook_dir, '..', 'figures')
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(os.path.join(figures_dir, 'chunking_strategies.png'))
plt.show()

# Before deciding which strategy is better, write BOTH chunked datasets to the data
# folder so later classification / topic-modeling steps can load either one cleanly.
strategy1_path = os.path.join(data_dir, 'chunks_strategy1.csv')
strategy2_path = os.path.join(data_dir, 'chunks_strategy2.csv')

chunks_s1.to_csv(strategy1_path, index=False)
chunks_s2.to_csv(strategy2_path, index=False)

print("\nSaved: chunks_strategy1.csv")
print("Saved: chunks_strategy2.csv")

Saved: chunks_strategy1.csv
Saved: chunks_strategy2.csv

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import (classification_report, f1_score, ConfusionMatrixDisplay, confusion_matrix)

# (Workflow 1 + 2): Train/Test Split and TF-IDF vectorization
#
# Two leakage problems are avoided here:
#   * The split is done per DOCUMENT, not per chunk. Chunks cut from the same
#     document share author, topic and wording, so letting them land on both sides
#     of the split inflates the test scores.
#   * The vectorizer is fitted on the training chunks only, so the vocabulary and
#     IDF weights never see test text.

# One label per document (all chunks of a document carry the same label),
# stratified so Human and AI documents keep their proportions in both sets
doc_labels = chunks_s2.groupby('doc_id')['label'].first()
train_docs, test_docs = train_test_split(
    doc_labels.index.to_numpy(),
    test_size = 0.2,
    random_state = RANDOM_SEED,
    stratify = doc_labels.to_numpy()
)
train_chunks = chunks_s2[chunks_s2['doc_id'].isin(train_docs)]
test_chunks = chunks_s2[chunks_s2['doc_id'].isin(test_docs)]
y_train = train_chunks['label']
y_test = test_chunks['label']

print("Vectorizing text with TF-IDF...")
tfidf = TfidfVectorizer(
    max_features = 10000,   # limits vocabulary to top 10k terms (efficiency)
    min_df = 2,   # ignore words appearing in only 1 chunk (likely noise)
    max_df = 0.95,    # ignore words in 95%+ of chunks (too common to be useful)
    stop_words = 'english',
    ngram_range = (1, 2)   # include both single words AND two-word phrases
)

X_train = tfidf.fit_transform(train_chunks['text'])   # learn vocabulary/IDF from training text only
X_test = tfidf.transform(test_chunks['text'])         # reuse the fitted vocabulary

# Full matrix and label vector under the original names, built with the
# train-fitted vectorizer, in case other cells still refer to X / y
X = tfidf.transform(chunks_s2['text'])
y = chunks_s2['label']

print(f"\t TF-IDF matrix shape: {X.shape}")
print(f"\t Vocabulary size: {len(tfidf.vocabulary_):,} terms")

print("\nTrain/Test Split:")
print(f"\t Training chunks: {X_train.shape[0]:,}")
print(f"\t Test chunks: {X_test.shape[0]:,}")
print(f"\t Train label dist: {dict(y_train.value_counts().rename({0:'Human', 1:'AI'}))}")
print(f"\t Test label dist: {dict(y_test.value_counts().rename({0:'Human', 1:'AI'}))}")

# (Workflow 3): Train Classifiers
# Both models are fitted on the same training matrix and scored on the same test matrix.

# Model 1: Logistic Regression
# max_iter=1000 gives the optimizer more iterations to converge on the large feature space
lr = LogisticRegression(max_iter=1000, random_state=RANDOM_SEED)

# Model 2: Multinomial Naive Bayes over the same TF-IDF features
nb = MultinomialNB()

print("\nTraining Logistic Regression...")
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)   # fit() returns the fitted estimator

print("Training Naive Bayes...")
y_pred_nb = nb.fit(X_train, y_train).predict(X_test)

# (Workflow 4): Evaluate: F1 Score & Classification Report
print("\n===Classification Results===")
model_predictions = {
    "Logistic Regression": y_pred_lr,
    "Naïve Bayes": y_pred_nb,
}
for model_name, y_pred in model_predictions.items():
    # average=None returns one F1 per class, ordered by label (0 = Human, 1 = AI)
    per_class_f1 = f1_score(y_test, y_pred, average=None)
    f1_human = per_class_f1[0]
    f1_ai = per_class_f1[1]
    f1_macro = f1_score(y_test, y_pred, average='macro')

    print(f"\n ---{model_name}---")
    print(f"\t Macro F1: {f1_macro:.4f}")
    print(f"\t F1 (Human): {f1_human:.4f}")
    print(f"\t F1 (AI): {f1_ai:.4f}")
    print("\n\t Full Classification Report:")
    print(classification_report(y_test, y_pred, target_names=['Human', 'AI']))

# (Workflow 5 - part 1): Interpret Results by visualizing with confusion matrices
# One panel per model, both scored against the same y_test.
fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('Confusion Matrices: Human vs. AI Classification', fontsize=13, fontweight='bold')

for ax, y_pred, title in zip(axes, [y_pred_lr, y_pred_nb], ['Logistic Regression', 'Naïve Bayes']):
    # labels=[0, 1] pins the row/column order so it always matches display_labels
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Human', 'AI'])
    disp.plot(ax=ax, colorbar=False, cmap='Blues')
    ax.set_title(title)

plt.tight_layout()

# Save next to the data folder instead of a machine-specific absolute path.
# NOTE(review): assumes the project's 'figures' folder is a sibling of 'data' - confirm.
figures_dir = os.path.join(notebook_dir, '..', 'figures')
os.makedirs(figures_dir, exist_ok=True)
plt.savefig(os.path.join(figures_dir, 'confusion_matrices.png'))
plt.show()

# (Workflow 5 - part 2): Interpret Results through top predictive features (logistic regression)
# Label 1 is AI, so the largest positive coefficients point towards AI and the
# most negative ones towards Human.
feature_names = tfidf.get_feature_names_out()
coefs = lr.coef_[0]
top_n = 15

# Sort once, ascending: the head holds the most negative weights, the tail the most positive
coef_order = coefs.argsort()
top_human_idx = coef_order[:top_n]
top_ai_idx = coef_order[-top_n:][::-1]   # reversed so the strongest AI term comes first

feature_sections = (
    ("\n ---Top 15 Features Predictive of AI Writing---", top_ai_idx),
    ("\n ---Top 15 Features Predictive of Human Writing---", top_human_idx),
)
for section_header, section_idx in feature_sections:
    print(section_header)
    for i in section_idx:
        print(f"\t {feature_names[i]:<30} coef: {coefs[i]:.4f}")

Vectorizing text with TF-IDF...
	 TF-IDF matrix shape: (27430, 10000)
	 Vocabulary size: 10,000 terms

Train/Test Split:
	 Training chunks: 21,944
	 Test chunks: 5,486
	 Train label dist: {'Human': np.int64(13372), 'AI': np.int64(8572)}
	 Test label dist: {'Human': np.int64(3343), 'AI': np.int64(2143)}

Training Logistic Regression...
Training Naive Bayes...

===Classification Results===

 ---Logistic Regression---
	 Macro F1: 0.8617
	 F1 (Human): 0.8975
	 F1 (AI): 0.8260

	 Full Classification Report:
              precision    recall  f1-score   support

       Human       0.87      0.93      0.90      3343
          AI       0.87      0.78      0.83      2143

    accuracy                           0.87      5486
   macro avg       0.87      0.86      0.86      5486
weighted avg       0.87      0.87      0.87      5486


 ---Naïve Bayes---
	 Macro F1: 0.8163
	 F1 (Human): 0.8636
	 F1 (AI): 0.7690

	 Full Classification Report:
              precision    recall  f1-score   support

       Human       0.84      0.89      0.86      3343
          AI       0.81      0.73      0.77      2143

    accuracy                           0.83      5486
   macro avg       0.82      0.81      0.82      5486
weighted avg       0.83      0.83      0.83      5486

 ---Top 15 Features Predictive of AI Writing---
	 potential                      coef: 4.8588
	 including                      coef: 4.4700
	 significant                    coef: 3.6509
	 additionally                   coef: 3.3520
	 explore                        coef: 3.3417
	 substeps                       coef: 3.2471
	 lead                           coef: 3.2338
	 ultimately                     coef: 3.1907
	 complex                        coef: 3.1578
	 profound                       coef: 3.1359
	 impact                         coef: 3.1236
	 leading                        coef: 2.9941
	 challenges                     coef: 2.8166
	 known                          coef: 2.5934
	 traditional                    coef: 2.5857

 ---Top 15 Features Predictive of Human Writing---
	 case                           coef: -4.0546
	 organization                   coef: -3.7293
	 2009                           coef: -3.5122
	 web                            coef: -3.3551
	 essay                          coef: -3.2501
	 market                         coef: -3.2103
	 2008                           coef: -3.1668
	 people                         coef: -3.1620
	 management                     coef: -3.0551
	 fact                           coef: -3.0159
	 2007                           coef: -2.9717
	 references                     coef: -2.9604
	 situation                      coef: -2.9498
	 cited                          coef: -2.8255
	 quite                          coef: -2.7917

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook'  # for working with interactive visuals in vscode

# Interactive bar chart of the strongest logistic-regression coefficients
import pandas as pd

feature_names = tfidf.get_feature_names_out()
coefs = lr.coef_[0]
top_n = 15

# Most positive coefficients point towards AI (label 1), most negative towards Human (label 0)
top_ai_idx = coefs.argsort()[-top_n:][::-1]
top_human_idx = coefs.argsort()[:top_n]

# Build a combined dataframe for plotting: one row per term, tagged with its class
feature_df = pd.DataFrame({
    'term' : (list(feature_names[top_ai_idx]) + list(feature_names[top_human_idx])),
    'coef' : (list(coefs[top_ai_idx]) + list(coefs[top_human_idx])),
    'class' : (['AI'] * top_n + ['Human'] * top_n)
}).sort_values('coef')

fig = px.bar(
    feature_df,
    x = 'coef',
    y = 'term',
    color = 'class',
    orientation = 'h',
    color_discrete_map = {'AI': 'tomato', 'Human': 'steelblue'},
    title = f'Top {top_n} Features Predictive of Human vs. AI Writing (Logistic Regression)',
    labels = {'coef': 'Coefficient (positive = AI, negative = Human)', 'term': 'Term'}
)
fig.update_layout(
    height = 700,
    yaxis = {'categoryorder': 'total ascending'},   # order bars by coefficient value
    showlegend = True,
    legend_title = 'Predicted Class'
)

# `fig` is a Plotly figure, so matplotlib's plt.savefig() cannot export it: it only
# wrote an empty matplotlib canvas. Use Plotly's own exporter; HTML keeps the chart
# interactive and needs no extra packages (fig.write_image() would give a static
# image but requires the optional 'kaleido' dependency).
# NOTE(review): assumes the project's 'figures' folder is a sibling of 'data' - confirm.
figures_dir = os.path.join(notebook_dir, '..', 'figures')
os.makedirs(figures_dir, exist_ok=True)
fig.write_html(os.path.join(figures_dir, 'feature_importance_LR.html'))
fig.show()

<Figure size 640x480 with 0 Axes>

from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer

def get_tfidf_weights(texts, max_features=3000):
    """
    Compute per-term TF-IDF weights for word cloud generation.

    A new TfidfVectorizer (English stop words removed, min_df of 2, vocabulary capped at `max_features`) is fitted on `texts`.
    Returns a dictionary of {term: mean_tfidf_score}, keeping only terms whose mean score is greater than zero.
    """
    vectorizer = TfidfVectorizer(
        stop_words = 'english',
        min_df = 2,
        max_features = max_features
    )
    tfidf_matrix = vectorizer.fit_transform(texts)
    terms = vectorizer.get_feature_names_out()
    # Column-wise mean of the matrix, flattened to a 1-D array (one score per term)
    avg_scores = tfidf_matrix.mean(axis=0).A1

    weights = {}
    for term, score in zip(terms, avg_scores):
        if score > 0:
            weights[term] = score
    return weights

# Split the Strategy 2 chunks into one list of texts per class (label 0 -> Human, label 1 -> AI)
human_texts = chunks_s2.loc[chunks_s2['label'] == 0, 'text'].tolist()
ai_texts = chunks_s2.loc[chunks_s2['label'] == 1, 'text'].tolist()

print(f"Generating word clouds from {len(human_texts):,} Human chunks and {len(ai_texts):,} AI chunks...")

# Each class gets its own TF-IDF fit, so weights are relative to that class only
human_weights = get_tfidf_weights(human_texts)
ai_weights = get_tfidf_weights(ai_texts)

# Settings shared by both clouds; only the colormap differs between them
wc_settings = dict(
    width=1200,
    height=600,
    background_color='white',
    max_words=150,
    min_font_size=10,
)
wc_human = WordCloud(colormap='Blues', **wc_settings).generate_from_frequencies(human_weights)
wc_ai = WordCloud(colormap='Reds', **wc_settings).generate_from_frequencies(ai_weights)

# Draw the two clouds next to each other, Human on the left and AI on the right
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
wc_panels = [
    (wc_human, 'Human Writing — TF-IDF Weighted Word Cloud', 'steelblue'),
    (wc_ai, 'AI Writing — TF-IDF Weighted Word Cloud', 'tomato'),
]
for panel_ax, (cloud, panel_title, title_color) in zip(axes, wc_panels):
    panel_ax.imshow(cloud, interpolation='bilinear')
    panel_ax.axis('off')
    panel_ax.set_title(panel_title, fontsize=14, fontweight='bold', color=title_color)

plt.suptitle('What words define Human vs. AI writing?', fontsize=16, fontweight='bold', y=1.02)
plt.tight_layout()
# Save before show() so the figure is written while it is still the current figure
plt.savefig('/Users/maddiemac/Desktop/INFO 230/cultural-analytics-project-1/figures/wordcloud')
plt.show()

Generating word clouds from 16,715 Human chunks and 10,715 AI chunks...

# Import the packages used for topic modeling (nothing is installed here)
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook'  # for working with interactive visuals in vscode


print("All topic modeling imports successful!")

All topic modeling imports successful!

# === Prepare text inputs for all 4 model/strategy combinations ===
#   Extract the raw text lists from each chunking strategy:
#     - BERTopic works directly on raw text (handles vectorization internally)
#     - LDA requires a CountVectorizer step first (bag-of-words counts)

# Chunking Strategy 1: document-level chunks
texts_s1 = chunks_s1['text'].tolist()
labels_s1 = chunks_s1['label'].tolist()

# Chunking Strategy 2: fixed 200-word window chunks
texts_s2 = chunks_s2['text'].tolist()
labels_s2 = chunks_s2['label'].tolist()

# ':,' adds a thousands separator (the previous ': ' spec only inserted a stray space)
print(f"Chunking Strategy 1: {len(texts_s1):,} documents")
print(f"Chunking Strategy 2: {len(texts_s2):,} documents")

# === CountVectorizer for LDA ===
#   LDA requires integer word counts, not just raw text
#   use the same parameters as TF-IDF vectorizer for consistency
#     but with CountVectorizer (since LDA needs raw counts not TF-IDF weights)
print("\nFitting CountVectorizer for LDA:")
count_vec = CountVectorizer(
    max_features=5000,     # keep vocabulary manageable for LDA
    min_df=5,              # ignore very rare words (noise)
    max_df=0.95,           # ignore words in 95%+ of docs (too common)
    stop_words='english'
)

# Learn the vocabulary on Chunking Strategy 2 (the primary chunking strategy from earlier)
# and reuse it for Strategy 1, so both matrices share the same term columns.
dtm_s2 = count_vec.fit_transform(texts_s2)  # document-term matrix for S2 (fit + transform in one pass)
dtm_s1 = count_vec.transform(texts_s1)      # document-term matrix for S1 (S2 vocabulary)

# Shapes are (documents, terms), i.e. document-term matrices
print(f"\tDTM Chunking Strategy 1 shape: {dtm_s1.shape}")
print(f"\tDTM Chunking Strategy 2 shape: {dtm_s2.shape}")
print(f"\tVocabulary size: {len(count_vec.vocabulary_):,} terms")

Chunking Strategy 1:  10000 documents
Chunking Strategy 2:  27430 documents

Fitting CountVectorizer for LDA:
	TDM Chunking Strategy 1 shape: (10000, 5000)
	TDM Chunking Strategy 2 shape: (27430, 5000)
	Vocabulary size:  5000 terms

print("===Model 1a: BERTopic on Chunking Strategy 1 (document-level chunks)===")
print("Fitting BERTopic: note this may take a few minutes to complete")

# Additional topic representations requested from BERTopic
representation_models = dict(
    KeyBERT=KeyBERTInspired(),                        # keyword-extraction based topic labels
    MMR=MaximalMarginalRelevance(diversity=0.3),      # reduces redundancy in topic keywords
)

bertopic_s1 = BERTopic(
    representation_model=representation_models,
    nr_topics=15,        # ask for 15 topics (enough to find meaningful clusters)
    min_topic_size=20,   # a topic needs at least 20 chunks (prevents tiny, noisy micro-topics)
    verbose=True,        # show progress while fitting
)

topics_bert_s1, probs_bert_s1 = bertopic_s1.fit_transform(texts_s1)

# Work on a copy of the chunk table and attach each chunk's assigned topic
chunks_s1 = chunks_s1.copy()
chunks_s1['topic_bert'] = topics_bert_s1

# Explore output; topic -1 is BERTopic's outlier bucket, so it is reported separately
topic_info_bert_s1 = bertopic_s1.get_topic_info()
assigned_topics_s1 = topic_info_bert_s1.loc[topic_info_bert_s1['Topic'] >= 0]
outlier_count_s1 = sum(1 for t in topics_bert_s1 if t == -1)

print(f"\nTopics found: {len(assigned_topics_s1)}")
print(f"Outlier chunks (-1): {outlier_count_s1:,}")
print(f"\nTopic summary (KeyBERT labels):")

# Always show Topic/Count; add the representation columns that are actually present
label_cols = ['Topic', 'Count']
for optional_col in ('Name', 'KeyBERT', 'MMR'):
    if optional_col in topic_info_bert_s1.columns:
        label_cols.append(optional_col)
print(assigned_topics_s1[label_cols].to_string(index=False))

# Save model for reuse (follow Professor Tim's method in notebooks)
bertopic_s1.save("bertopic_s1_model")
print("\nModel saved: bertopic_s1_model/")

2026-02-25 17:19:54,313 - BERTopic - Embedding - Transforming documents to embeddings.

===Model 1a: BERTopic on Chunking Strategy 1 (document-level chunks)===
Fitting BERTopic: note this may take a few minutes to complete

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

2026-02-25 17:20:45,219 - BERTopic - Embedding - Completed ✓
2026-02-25 17:20:45,219 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-25 17:20:53,291 - BERTopic - Dimensionality - Completed ✓
2026-02-25 17:20:53,292 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-25 17:20:53,447 - BERTopic - Cluster - Completed ✓
2026-02-25 17:20:53,447 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-02-25 17:20:54,876 - BERTopic - Representation - Completed ✓
2026-02-25 17:20:54,878 - BERTopic - Topic reduction - Reducing number of topics
2026-02-25 17:20:54,884 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-25 17:20:58,391 - BERTopic - Representation - Completed ✓
2026-02-25 17:20:58,414 - BERTopic - Topic reduction - Reduced number of topics from 78 to 15
2026-02-25 17:20:58,865 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.

Topics found: 14
Outlier chunks (-1): 3,255

Topic summary (KeyBERT labels):
 Topic  Count                                Name                                                                                                         KeyBERT                                                                                         MMR
     0   1859                     0_the_to_of_and                    [education, organization, management, be, students, being, business, life, school, learning]                                          [to, of, be, as, are, their, on, students, an, at]
     1   1199                     1_the_and_of_to                                          [nature, characters, art, light, story, life, human, about, who, what]                                         [of, to, her, she, his, with, time, their, be, our]
     2    848                    2_was_the_and_to                                [restaurant, food, pizza, chicken, delicious, place, cheese, service, bowl, get]                                   [to, it, food, is, place, at, great, were, have, service]
     3    606                     3_the_of_and_to [sustainable, pollution, economic, driverless, climate, environmental, driving, cars, transportation, benefits]                             [of, are, cars, can, be, car, driverless, energy, water, their]
     4    461                     4_and_the_to_of    [technology, technologies, communication, internet, social, privacy, intelligence, media, computer, digital]              [to, technology, can, media, social, their, information, by, individuals, use]
     5    438                     5_the_of_and_to                     [nursing, nurses, patients, nurse, medical, patient, healthcare, health, hospital, ethical]                        [to, health, care, are, genetic, patients, or, medical, nursing, an]
     6    262                      6_we_the_of_to                      [learning, features, recognition, datasets, dataset, networks, neural, models, deep, data]         [in, learning, neural, data, problem, models, features, deep, networks, algorithms]
     7    259                     7_the_to_in_and                            [sports, olympics, olympic, sport, soccer, football, athletes, ball, england, match]                             [of, on, at, games, his, sports, soccer, be, football, players]
     8    240                      8_the_of_to_in                          [offenders, crime, justice, criminal, punishment, court, police, law, cases, economic]                              [of, to, in, was, be, police, european, criminal, justice, eu]
     9    220                   9_your_you_the_to                                                 [skin, hair, wear, wash, face, using, wearing, dry, dress, use]                               [your, title, or, hair, step, can, substeps, be, skin, color]
    10    129              10_venus_face_mars_the                           [mars, martian, geological, planets, landform, cydonia, face, surface, earth, earths]          [venus, mars, planet, landform, earth, nasa, surface, aliens, geological, planets]
    11    124 11_electoral_college_vote_president                      [electoral, electors, voting, elections, election, voters, elected, vote, electing, votes] [electoral, vote, president, election, system, candidate, voters, electors, would, elected]
    12     74                     12_the_in_of_to             [antennas, antenna, wireless, communications, broadcasting, channel, radio, network, rf, streaming]    [antenna, antennas, network, radio, energy, technology, channel, wncu, we, transmission]
    13     26           13_bowl_super_broncos_the                             [nfl, seahawks, steelers, afc, colts, patriots, manning, broncos, quarterback, nfc]            [bowl, broncos, nfl, yards, steelers, seahawks, afc, quarterback, 2016, manning]

Model saved: bertopic_s1_model/

print("===Model 1b: BERTopic on Chunking Strategy 2 (200-word window chunks)===")
print("Fitting BERTopic: note this may take a few minutes to complete")

bertopic_s2 = BERTopic(
    representation_model=representation_models,
    nr_topics=15,        # ask for 15 topics (enough to find meaningful clusters)
    min_topic_size=50,   # slightly higher to account for larger corpus
    verbose=True,        # show progress while fitting
)

topics_bert_s2, probs_bert_s2 = bertopic_s2.fit_transform(texts_s2)

# Work on a copy of the chunk table and attach each chunk's assigned topic
chunks_s2 = chunks_s2.copy()
chunks_s2['topic_bert'] = topics_bert_s2

# Explore output; topic -1 is BERTopic's outlier bucket, so it is reported separately
topic_info_bert_s2 = bertopic_s2.get_topic_info()
assigned_topics_s2 = topic_info_bert_s2.loc[topic_info_bert_s2['Topic'] >= 0]
outlier_count_s2 = sum(1 for t in topics_bert_s2 if t == -1)

print(f"\nTopics found: {len(assigned_topics_s2)}")
print(f"Outlier chunks (-1): {outlier_count_s2:,}")
print(f"\nTopic summary (KeyBERT labels):")

# Always show Topic/Count; add the representation columns that are actually present
label_cols = ['Topic', 'Count']
for optional_col in ('Name', 'KeyBERT', 'MMR'):
    if optional_col in topic_info_bert_s2.columns:
        label_cols.append(optional_col)
print(assigned_topics_s2[label_cols].to_string(index=False))

# Save model for reuse (follow Professor Tim's method in notebooks)
bertopic_s2.save("bertopic_s2_model")
print("\nModel saved: bertopic_s2_model/")

2026-02-25 17:22:09,478 - BERTopic - Embedding - Transforming documents to embeddings.

===Model 1b: BERTopic on Chunking Strategy 2 (200-word window chunks)===
Fitting BERTopic: note this may take a few minutes to complete

Batches:   0%|          | 0/858 [00:00<?, ?it/s]

2026-02-25 17:24:44,410 - BERTopic - Embedding - Completed ✓
2026-02-25 17:24:44,411 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-25 17:24:48,150 - BERTopic - Dimensionality - Completed ✓
2026-02-25 17:24:48,155 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-25 17:24:50,352 - BERTopic - Cluster - Completed ✓
2026-02-25 17:24:50,353 - BERTopic - Representation - Extracting topics using c-TF-IDF for topic reduction.
2026-02-25 17:24:52,009 - BERTopic - Representation - Completed ✓
2026-02-25 17:24:52,011 - BERTopic - Topic reduction - Reducing number of topics
2026-02-25 17:24:52,021 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-25 17:24:54,758 - BERTopic - Representation - Completed ✓
2026-02-25 17:24:54,761 - BERTopic - Topic reduction - Reduced number of topics from 105 to 15
2026-02-25 17:24:55,042 - BERTopic - WARNING: When you use `pickle` to save/load a BERTopic model,please make sure that the environments in which you saveand load the model are **exactly** the same. The version of BERTopic,its dependencies, and python need to remain the same.

Topics found: 14
Outlier chunks (-1): 10,799

Topic summary (KeyBERT labels):
 Topic  Count                         Name                                                                                                             KeyBERT                                                                                         MMR
     0   3537              0_the_of_and_to                                           [art, culture, cultural, society, war, life, power, world, social, being]                                           [of, to, as, by, their, be, his, were, an, which]
     1   2631              1_and_the_to_of                       [technology, benefits, social, cars, transportation, public, car, information, health, media]                          [and, to, can, be, social, cars, their, technology, people, media]
     2   2577        2_to_and_the_students                      [students, student, school, schools, education, activities, teachers, classes, children, life]                              [to, students, of, in, school, be, their, on, education, will]
     3   2291              3_the_of_to_and [management, managers, organization, organizations, business, company, companies, development, employees, strategy]          [to, in, be, management, their, company, employees, project, market, organization]
     4   1849            4_was_the_she_and                                                           [room, she, very, good, had, food, her, what, you, didnt]                                           [she, to, my, had, his, of, they, on, were, back]
     5    913              5_the_of_to_and                             [ethical, genetic, animals, animal, rights, gene, research, health, medicine, diseases]       [to, genetic, animals, health, ethical, testing, cancer, patients, research, disease]
     6    818              6_the_of_and_is                                 [cosmos, mars, nasa, space, universe, planets, earth, martian, planet, exploration]                         [our, venus, mars, universe, space, dreams, its, earth, planet, be]
     7    557              7_the_to_in_and                                   [football, players, soccer, league, sports, stadium, ball, sport, teams, england]                                 [to, of, on, at, games, be, sports, soccer, season, league]
     8    531               8_the_of_to_we                   [learning, networks, neural, recognition, algorithms, deep, training, models, network, computing]        [we, data, network, networks, algorithm, model, wireless, learning, neural, problem]
     9    306 9_electoral_college_vote_the                             [electoral, electors, voter, elections, voters, voting, election, elected, vote, votes]     [electoral, vote, president, states, election, system, votes, voters, electors, voting]
    10    189           10_your_you_to_and                                            [wash, using, face, clean, skin, wear, use, tablespoon, disposable, dry]                   [your, skin, hair, fashion, use, step, dress, substeps, clothing, diaper]
    11    176           11_light_the_of_is                 [lighting, light, photography, electricity, colors, wavelengths, energy, electrical, color, vision]                [light, electrons, can, color, energy, on, be, circuit, colors, electricity]
    12    139  12_oil_wastewater_water_the               [refinery, oil, reservoirs, wastewater, fluids, viscosity, reservoir, fluid, refineries, hydrocarbon] [oil, wastewater, reservoir, gas, pressure, permeability, recovery, fluid, refinery, crude]
    13    117    13_the_european_turkey_of                          [eu, eus, european, europe, constitutional, parliament, government, commission, court, ec]       [european, turkey, eu, union, rights, fundamental, court, national, economic, treaty]

Model saved: bertopic_s2_model/

print("===Model 2a: LDA: Chunking Strategy 1 (document-level chunks)===")
print("Fitting LDA:")

# Settings for the Strategy 1 LDA model
lda_s1_params = dict(
    n_components=15,             # match BERTopic's topic count for fair comparison
    max_iter=20,                 # upper bound on training iterations (more = better but slower)
    learning_method='online',    # chosen for efficiency on larger datasets vs. 'batch'
    random_state=RANDOM_SEED,    # reproducibility
    verbose=1,
)
lda_s1 = LatentDirichletAllocation(**lda_s1_params)

# fit_transform returns one row of topic weights per chunk;
# each chunk is assigned the topic with the largest weight
doc_topic_lda_s1 = lda_s1.fit_transform(dtm_s1)
chunks_s1['topic_lda'] = doc_topic_lda_s1.argmax(axis=1)

# Vocabulary terms, in the column order of the document-term matrices.
# For LDA, topics are described by their highest-weighted words.
vocab = count_vec.get_feature_names_out()

def get_lda_top_words(model, vocab, n_top=10):
    """Extract the top `n_top` words for each LDA topic.

    Args:
        model: fitted topic model exposing `components_`, iterated as one
            array of per-term weights per topic.
        vocab: sequence of terms, indexed by the positions of those weights.
        n_top: number of highest-weighted words to keep per topic.

    Returns:
        Dict mapping topic index -> list of words, ordered from the highest
        to the lowest weight.
    """
    topics = {}
    for idx, topic in enumerate(model.components_):
        # Sort descending first, then truncate. Slicing the ascending order
        # with [-n_top:] returns the whole vocabulary when n_top == 0,
        # because -0 == 0.
        ranked = topic.argsort()[::-1][:n_top]
        topics[idx] = [vocab[i] for i in ranked]
    return topics

lda_s1_topics = get_lda_top_words(lda_s1, vocab)

# One line per topic: right-aligned topic id followed by its words
print(f"\nLDA Topics (Strategy 1) — Top 10 words per topic:")
for topic_id, words in lda_s1_topics.items():
    joined_words = ' | '.join(words)
    print(f"\tTopic {topic_id:>2}: {joined_words}")

===Model 2a: LDA: Chunking Strategy 1 (document-level chunks)===
Fitting LDA:
iteration: 1 of max_iter: 20
iteration: 2 of max_iter: 20
iteration: 3 of max_iter: 20
iteration: 4 of max_iter: 20
iteration: 5 of max_iter: 20
iteration: 6 of max_iter: 20
iteration: 7 of max_iter: 20
iteration: 8 of max_iter: 20
iteration: 9 of max_iter: 20
iteration: 10 of max_iter: 20
iteration: 11 of max_iter: 20
iteration: 12 of max_iter: 20
iteration: 13 of max_iter: 20
iteration: 14 of max_iter: 20
iteration: 15 of max_iter: 20
iteration: 16 of max_iter: 20
iteration: 17 of max_iter: 20
iteration: 18 of max_iter: 20
iteration: 19 of max_iter: 20
iteration: 20 of max_iter: 20

LDA Topics (Strategy 1) — Top 10 words per topic:
	Topic  0: students | school | education | learning | children | community | help | time | student | work
	Topic  1: technology | human | world | time | new | potential | face | digital | space | earth
	Topic  2: people | individuals | social | individual | life | self | behavior | positive | personal | person
	Topic  3: government | states | war | political | united | people | world | country | new | american
	Topic  4: use | ethical | law | privacy | animals | animal | police | legal | rights | case
	Topic  5: cultural | women | people | culture | society | art | social | life | world | music
	Topic  6: health | care | patients | patient | medical | water | al | treatment | et | healthcare
	Topic  7: college | electoral | states | vote | president | popular | people | election | state | program
	Topic  8: like | just | time | people | don | know | good | said | think | want
	Topic  9: year | 000 | games | said | years | 10 | game | 12 | team | 15
	Topic 10: car | cars | people | public | city | driverless | driving | air | transportation | traffic
	Topic 11: company | business | market | products | marketing | companies | customers | services | financial | industry
	Topic 12: research | data | used | study | based | information | use | analysis | different | using
	Topic 13: economic | energy | environmental | global | countries | resources | food | development | sustainable | impact
	Topic 14: management | employees | organization | project | work | performance | leadership | organizations | process | communication

print("===Model 2b: LDA: Chunking Strategy 2 (200-word window chunks)===")
print("Fitting LDA:")

# Settings for the Strategy 2 LDA model
lda_s2_params = dict(
    n_components=15,             # match BERTopic's topic count for fair comparison
    max_iter=20,                 # upper bound on training iterations (more = better but slower)
    learning_method='online',    # chosen for efficiency on larger datasets vs. 'batch'
    random_state=RANDOM_SEED,    # reproducibility
    verbose=1,
)
lda_s2 = LatentDirichletAllocation(**lda_s2_params)

# fit_transform returns one row of topic weights per chunk;
# each chunk is assigned the topic with the largest weight
doc_topic_lda_s2 = lda_s2.fit_transform(dtm_s2)
chunks_s2['topic_lda'] = doc_topic_lda_s2.argmax(axis=1)

lda_s2_topics = get_lda_top_words(lda_s2, vocab)

# One line per topic: right-aligned topic id followed by its words
print(f"\nLDA Topics (Strategy 2) — Top 10 words per topic:")
for topic_id, words in lda_s2_topics.items():
    joined_words = ' | '.join(words)
    print(f"\tTopic {topic_id:>2}: {joined_words}")

===Model 2b: LDA: Chunking Strategy 2 (200-word window chunks)===
Fitting LDA:
iteration: 1 of max_iter: 20
iteration: 2 of max_iter: 20
iteration: 3 of max_iter: 20
iteration: 4 of max_iter: 20
iteration: 5 of max_iter: 20
iteration: 6 of max_iter: 20
iteration: 7 of max_iter: 20
iteration: 8 of max_iter: 20
iteration: 9 of max_iter: 20
iteration: 10 of max_iter: 20
iteration: 11 of max_iter: 20
iteration: 12 of max_iter: 20
iteration: 13 of max_iter: 20
iteration: 14 of max_iter: 20
iteration: 15 of max_iter: 20
iteration: 16 of max_iter: 20
iteration: 17 of max_iter: 20
iteration: 18 of max_iter: 20
iteration: 19 of max_iter: 20
iteration: 20 of max_iter: 20

LDA Topics (Strategy 2) — Top 10 words per topic:
	Topic  0: time | year | said | years | day | began | music | old | took | man
	Topic  1: information | process | patients | development | care | theory | research | behavior | project | patient
	Topic  2: media | social | introduction | paper | essay | literature | people | works | history | book
	Topic  3: like | people | just | know | good | time | don | think | make | want
	Topic  4: health | potential | use | energy | environmental | public | urban | concerns | benefits | ethical
	Topic  5: government | country | countries | economic | political | law | state | rights | states | china
	Topic  6: women | water | al | et | body | woman | pressure | men | treatment | high
	Topic  7: students | school | education | learning | college | help | time | student | children | skills
	Topic  8: human | time | face | language | life | earth | space | nature | scientific | science
	Topic  9: world | social | cultural | individuals | people | new | technology | role | culture | impact
	Topic 10: people | united | war | cars | states | car | society | life | family | children
	Topic 11: company | market | business | services | products | companies | customers | marketing | industry | new
	Topic 12: data | analysis | study | research | used | case | problem | number | based | information
	Topic 13: organization | employees | management | work | performance | goals | needs | success | ensure | team
	Topic 14: web | new | 2011 | university | journal | management | york | 2012 | 2010 | leadership

# Helper that turns a BERTopic model's topic table into short, readable labels
def get_bertopic_labels(model, style="KeyBERT", n_words=3):
    """Build {topic_id: "T<id>: <label>"} for every topic with id >= 0.

    The label is the first `n_words` entries of the topic's `style` column,
    joined with " | ". When that column is absent, or the cell is not a list,
    the first 40 characters of the topic's "Name" are used instead.
    """
    info = model.get_topic_info()
    info = info.loc[info["Topic"] >= 0]   # drop the -1 (outlier) row
    style_available = style in info.columns

    def _describe(row):
        keywords = row[style] if style_available else None
        if isinstance(keywords, list):
            return " | ".join(map(str, keywords[:n_words]))
        return str(row["Name"])[:40]

    return {
        row["Topic"]: f"T{row['Topic']}: {_describe(row)}"
        for _, row in info.iterrows()
    }

# BERTopic labels are generated from each model's KeyBERT keywords (first three per topic)
bert_s1_labels = get_bertopic_labels(bertopic_s1, style="KeyBERT", n_words=3)
bert_s2_labels = get_bertopic_labels(bertopic_s2, style="KeyBERT", n_words=3)

# LDA labels are written by hand, one entry per topic index (0-14).
# NOTE(review): these presumably summarise the top words printed for each fitted
# LDA model — re-check them if the models are refit with different settings.
lda_s1_labels = {
    0: "T0: Education & School",
    1: "T1: Technology & Space",
    2: "T2: Social & Personal",
    3: "T3: Government & War",
    4: "T4: Law & Ethics",
    5: "T5: Culture & Art",
    6: "T6: Health & Medicine",
    7: "T7: Electoral College",
    8: "T8: Casual Conversation",
    9: "T9: Sports & Numbers",
    10: "T10: Cars & Transport",
    11: "T11: Business & Market",
    12: "T12: Research & Data",
    13: "T13: Environment & Energy",
    14: "T14: Management & Work",
}

lda_s2_labels = {
    0: "T0: Narrative & Time",
    1: "T1: Research & Patients",
    2: "T2: Media & Academic",
    3: "T3: Casual Conversation",
    4: "T4: Health & Environment",
    5: "T5: Government & Politics",
    6: "T6: Women & Medicine",
    7: "T7: Education & School",
    8: "T8: Science & Nature",
    9: "T9: Culture & Technology",
    10: "T10: Society & Family",
    11: "T11: Business & Market",
    12: "T12: Data & Research",
    13: "T13: Management & Teams",
    14: "T14: Academic Citations",
}

def build_label_topic_matrix(chunks_df, topic_col, topic_labels):
    """
    Summarise how Human and AI chunks are spread over topics.

    Rows with a negative value in `topic_col` are left out. The result has one row per
    label name ('Human' for label 0, 'AI' for label 1) and one column per topic; each value
    is the share of that label's chunks assigned to the topic, so every row sums to 1.
    Columns are renamed through `topic_labels`, with "T<id>: Unknown" for ids it lacks.
    """
    assigned = chunks_df.loc[chunks_df[topic_col] >= 0].copy()
    assigned['label_name'] = assigned['label'].map({0: 'Human', 1: 'AI'})

    # Count chunks per (label, topic) pair; combinations that never occur become 0
    counts = assigned.groupby(['label_name', topic_col]).size().unstack(fill_value=0)

    # Divide every row by its own total to turn counts into proportions
    row_totals = counts.sum(axis=1)
    proportions = counts.div(row_totals, axis=0)

    readable_columns = []
    for topic_id in proportions.columns:
        readable_columns.append(topic_labels.get(topic_id, f"T{topic_id}: Unknown"))
    proportions.columns = readable_columns
    return proportions

# One label-by-topic proportion matrix per model / chunking-strategy combination
mat_bert_s1_labeled = build_label_topic_matrix(chunks_s1, "topic_bert", bert_s1_labels)
mat_bert_s2_labeled = build_label_topic_matrix(chunks_s2, "topic_bert", bert_s2_labels)
mat_lda_s1_labeled  = build_label_topic_matrix(chunks_s1, "topic_lda",  lda_s1_labels)
mat_lda_s2_labeled  = build_label_topic_matrix(chunks_s2, "topic_lda",  lda_s2_labels)

# === Interactive Plotly heatmaps ===
#   hover over each cell to see exact proportion values
heatmap_specs = [
    (mat_bert_s1_labeled, "BERTopic: Strategy 1 (Document-Level)"),
    (mat_bert_s2_labeled, "BERTopic: Strategy 2 (200-Word Windows)"),
    (mat_lda_s1_labeled, "LDA: Strategy 1 (Document-Level)"),
    (mat_lda_s2_labeled, "LDA: Strategy 2 (200-Word Windows)"),
]
for mat, title in heatmap_specs:
    topic_names = list(mat.columns)   # x axis: one column per topic label
    class_names = list(mat.index)     # y axis: one row per class label
    fig = px.imshow(
        mat.values,
        x=topic_names,
        y=class_names,
        color_continuous_scale='YlOrRd',
        title=f'Human vs. AI Topic Distribution: {title}',
        labels={'color': 'Proportion of chunks'},
        aspect='auto',
    )
    fig.update_layout(height=400, xaxis_title='Topic', yaxis_title='Label')
    fig.show()

# Write both chunk tables (including their topic columns) into the data directory
topic_exports = (
    (chunks_s1, 'chunks_s1_with_topics.csv'),
    (chunks_s2, 'chunks_s2_with_topics.csv'),
)
for chunk_table, csv_name in topic_exports:
    chunk_table.to_csv(os.path.join(data_dir, csv_name), index=False)

# Confirm what was written and which columns each table carries
print("Saved: chunks_s1_with_topics.csv")
print("Saved: chunks_s2_with_topics.csv")
print(f"\nchunks_s1 columns: {list(chunks_s1.columns)}")
print(f"chunks_s2 columns: {list(chunks_s2.columns)}")

Saved: chunks_s1_with_topics.csv
Saved: chunks_s2_with_topics.csv

chunks_s1 columns: ['text', 'source', 'label', 'word_count', 'doc_id', 'chunk_id', 'topic_bert', 'topic_lda']
chunks_s2 columns: ['doc_id', 'chunk_id', 'text', 'source', 'label', 'word_count', 'topic_bert', 'topic_lda']

# Visualize top topics as bar chart (Model 1a: BERTopic on Chunking Strategy 1)
bertopic_s1.visualize_barchart(top_n_topics=15)

# Visualize top topics as bar chart (Model 1b: BERTopic on Chunking Strategy 2)
bertopic_s2.visualize_barchart(top_n_topics=15)

from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import chi2_contingency
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# EVALUATION 1: Topic Interpretability Comparison
    # Side-by-side comparison of BERTopic KeyBERT labels vs. LDA top words, for both chunking strategies

print("===Evaluation 1: Topic Interpretability (BERTopic vs LDA)===")

def get_bertopic_label_list(model, n_words=5):
    """Return [(topic_id, label_string), ...] for every topic with id >= 0.

    The label joins the first `n_words` entries of the topic's KeyBERT list with ' | '.
    If there is no KeyBERT column, or the cell is not a list, the first 50 characters
    of the topic's Name are used instead.
    """
    info = model.get_topic_info()
    info = info.loc[info['Topic'] >= 0]   # drop the -1 (outlier) row
    has_keybert = 'KeyBERT' in info.columns

    labelled = []
    for _, rec in info.iterrows():
        keywords = rec['KeyBERT'] if has_keybert else None
        if isinstance(keywords, list):
            text = ' | '.join(map(str, keywords[:n_words]))
        else:
            text = str(rec['Name'])[:50]
        labelled.append((rec['Topic'], text))
    return labelled

def get_lda_label_list(model, vocab, n_words=5):
    """Get list of (topic_id, label_string) from LDA top words.

    Args:
        model: fitted topic model exposing `components_`, iterated as one
            array of per-term weights per topic.
        vocab: sequence of terms, indexed by the positions of those weights.
        n_words: number of highest-weighted words joined into each label.

    Returns:
        List of (topic index, 'word | word | ...') tuples in topic order,
        words running from the highest to the lowest weight.
    """
    results = []
    for idx, topic in enumerate(model.components_):
        # Sort descending first, then truncate. Slicing the ascending order
        # with [-n_words:] returns the whole vocabulary when n_words == 0,
        # because -0 == 0.
        ranked = topic.argsort()[::-1][:n_words]
        results.append((idx, ' | '.join(vocab[i] for i in ranked)))
    return results

vocab_list = count_vec.get_feature_names_out()

bert_s1_labels_list = get_bertopic_label_list(bertopic_s1)
bert_s2_labels_list = get_bertopic_label_list(bertopic_s2)
lda_s1_labels_list = get_lda_label_list(lda_s1, vocab_list)
lda_s2_labels_list = get_lda_label_list(lda_s2, vocab_list)


def _label_column(pairs, n_rows):
    """Return the label strings of (topic_id, label) pairs, padded with '—' up to n_rows entries."""
    texts = [text for _, text in pairs]
    texts.extend(['—'] * (n_rows - len(texts)))
    return texts


# Comparison tables: rows line the two models up by position only, so the
# BERTopic and LDA entries on the same row are not necessarily the same theme.
print("\n---Strategy 1 (Document-Level): BERTopic vs LDA---")
max_len = max(len(bert_s1_labels_list), len(lda_s1_labels_list))
comp_s1 = pd.DataFrame({
    'Topic': [f"T{i}" for i in range(max_len)],
    'BERTopic (KeyBERT)': _label_column(bert_s1_labels_list, max_len),
    'LDA (Top Words)': _label_column(lda_s1_labels_list, max_len),
})
print(comp_s1.to_string(index=False))

print("\n---Strategy 2 (200-Word Windows): BERTopic vs LDA---")
max_len = max(len(bert_s2_labels_list), len(lda_s2_labels_list))
comp_s2 = pd.DataFrame({
    'Topic': [f"T{i}" for i in range(max_len)],
    'BERTopic (KeyBERT)': _label_column(bert_s2_labels_list, max_len),
    'LDA (Top Words)': _label_column(lda_s2_labels_list, max_len),
})
print(comp_s2.to_string(index=False))

# EVALUATION 2: Topic Diversity
    # Proportion of unique words across all topic top-word lists (if the same words keep appearing across topics, diversity is low)
    # Use the top 10 words per topic for this calculation

print("===Evaluation 2: Topic Diversity===")

def compute_diversity(topic_words_list):
    """
    Compute topic diversity: distinct words divided by total words, pooled
    over every topic's word list.
    topic_words_list: list of lists of top words per topic.
    Returns 1.0 when no word is repeated anywhere; the more words are shared
    across (or repeated within) topics, the lower the score.
    Returns 0.0 when there are no words at all.
    """
    distinct = set()
    total = 0
    for topic in topic_words_list:
        for word in topic:
            distinct.add(word)
            total += 1

    # Guard the division: an empty input has no meaningful ratio.
    if total == 0:
        return 0.0
    return len(distinct) / total

# Extract the top words per topic as a list of lists (10 per topic by default)
def bertopic_words_list(model, n=10):
    """Return up to ``n`` words for each BERTopic topic whose id is >= 0.

    Topic ids come from the 'Topic' column of ``model.get_topic_info()``, in
    that table's row order; negative ids are dropped. For each remaining id,
    the words are taken from the (word, score) pairs of ``model.get_topic``.
    """
    info = model.get_topic_info()
    kept_ids = info.loc[info['Topic'] >= 0, 'Topic'].tolist()

    words_per_topic = []
    for topic_id in kept_ids:
        words = [word for word, _score in model.get_topic(topic_id)]
        words_per_topic.append(words[:n])
    return words_per_topic

def lda_words_list(model, vocab, n=10):
    """Return, per row of ``model.components_``, its ``n`` top-weighted words.

    Words are looked up in ``vocab`` by column index and ordered from the
    highest weight to the lowest.
    """
    words_per_topic = []
    for weights in model.components_:
        # argsort is ascending: take the last n indices, then reverse them.
        top_indices = weights.argsort()[-n:][::-1]
        words_per_topic.append([vocab[j] for j in top_indices])
    return words_per_topic

# Top-word lists per topic for each model / chunking-strategy combination
bert_s1_wl = bertopic_words_list(bertopic_s1)
bert_s2_wl = bertopic_words_list(bertopic_s2)
lda_s1_wl = lda_words_list(lda_s1, vocab_list)
lda_s2_wl = lda_words_list(lda_s2, vocab_list)

div_bert_s1 = compute_diversity(bert_s1_wl)
div_bert_s2 = compute_diversity(bert_s2_wl)
div_lda_s1 = compute_diversity(lda_s1_wl)
div_lda_s2 = compute_diversity(lda_s2_wl)

# Blank line after the section header, then one tab-indented line per run.
# (Previously the first line had no tab, so it did not align with the rest.)
print()
for run_name, diversity in [
    ("BERTopic S1 (Doc-Level)", div_bert_s1),
    ("BERTopic S2 (200-Word)", div_bert_s2),
    ("LDA S1 (Doc-Level)", div_lda_s1),
    ("LDA S2 (200-Word)", div_lda_s2),
]:
    print(f"\t{run_name} diversity: {diversity:.3f}")

# EVALUATION 3: Topic Distinctiveness via Cosine Similarity
    # Represent each topic as a TF-IDF vector over its top words, then compute
    # pairwise cosine similarity between all topics.
        # Low average similarity = topics are well separated from each other
        # High average similarity = topics are redundant / overlapping

print("===Evaluation 3: Topic Distinctiveness (Cosine Similarity)===")

def topic_similarity_matrix(topic_words_list, vectorizer):
    """
    Compute pairwise cosine similarity between topics.

    topic_words_list: list of lists of words, one inner list per topic.
    vectorizer: an already-fitted vectorizer; only ``transform`` is called,
        so words outside its fitted vocabulary contribute nothing.
    Returns (similarity matrix, mean of its off-diagonal entries).
    NOTE: with fewer than two topics there are no off-diagonal entries, so
    the mean is taken over an empty selection (NumPy yields NaN).
    """
    # One space-joined pseudo-document per topic
    pseudo_docs = []
    for words in topic_words_list:
        pseudo_docs.append(' '.join(words))

    topic_vectors = vectorizer.transform(pseudo_docs)
    sim_matrix = cosine_similarity(topic_vectors)

    # Drop the diagonal (each topic's similarity with itself) before averaging
    off_diagonal = np.logical_not(np.eye(sim_matrix.shape[0], dtype=bool))
    mean_sim = sim_matrix[off_diagonal].mean()

    return sim_matrix, mean_sim

# Similarity matrix and mean off-diagonal similarity for each model / strategy
sim_bert_s1, mean_bert_s1 = topic_similarity_matrix(bert_s1_wl, tfidf)
sim_bert_s2, mean_bert_s2 = topic_similarity_matrix(bert_s2_wl, tfidf)
sim_lda_s1, mean_lda_s1 = topic_similarity_matrix(lda_s1_wl, tfidf)
sim_lda_s2, mean_lda_s2 = topic_similarity_matrix(lda_s2_wl, tfidf)

print("\nMean inter-topic similarity (lower = more distinct topics):")
print(f"\tBERTopic S1 (Doc-Level): {mean_bert_s1:.4f}")
print(f"\tBERTopic S2 (200-Word): {mean_bert_s2:.4f}")
print(f"\tLDA S1 (Doc-Level): {mean_lda_s1:.4f}")
print(f"\tLDA S2 (200-Word): {mean_lda_s2:.4f}")

# Visualize similarity matrices as a 2x2 grid of heatmaps
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle(
    'Topic Similarity Matrices\n(darker = more similar topics, lower avg = more distinct)',
    fontsize=13,
    fontweight='bold',
)

# (panel title, similarity matrix, mean off-diagonal similarity), in grid order
heatmap_panels = [
    ('BERTopic: S1 (Doc-Level)', sim_bert_s1, mean_bert_s1),
    ('BERTopic: S2 (200-Word)', sim_bert_s2, mean_bert_s2),
    ('LDA: S1 (Doc-Level)', sim_lda_s1, mean_lda_s1),
    ('LDA: S2 (200-Word)', sim_lda_s2, mean_lda_s2),
]

for ax, (title, sim, mean) in zip(axes.flatten(), heatmap_panels):
    # Fixed 0-1 colour scale so the four panels are directly comparable
    sns.heatmap(sim, ax=ax, cmap='YlOrRd', vmin=0, vmax=1, cbar_kws={'label': 'Cosine Similarity'})
    ax.set_title(f'{title}\nMean inter-topic similarity: {mean:.4f}')
    ax.set_xlabel('Topic')
    ax.set_ylabel('Topic')

plt.tight_layout()
# To persist the figure, call plt.savefig(<path>) here, before plt.show().
plt.show()

# EVALUATION 4: Human vs. AI Topic Separation (Chi-Square Test)
    # Using Chi-Square test for independence

print("===Evaluation 4: Human vs. AI Topic Separation (Chi-Square Test)===")

def chi_square_topic_label(chunks_df, topic_col):
    """
    Chi-square test of independence between topic assignment and the
    'label' column of ``chunks_df``.

    Rows whose ``topic_col`` value is negative (outlier topic -1) are
    excluded before the label-by-topic contingency table is built.
    Returns (chi2 statistic, p-value, degrees of freedom, contingency table).
    """
    assigned = chunks_df.loc[chunks_df[topic_col] >= 0]
    contingency = pd.crosstab(assigned['label'], assigned[topic_col])
    # The expected-frequency table is computed by SciPy but not needed here.
    chi2, p, dof, _expected = chi2_contingency(contingency)
    return chi2, p, dof, contingency

# Run the chi-square test once per model / strategy, report it, and keep the
# p-values so the summary table below does not have to recompute them.
chi_square_runs = [
    ("BERTopic: S1 (Doc-Level)", chunks_s1, 'topic_bert'),
    ("BERTopic: S2 (200-Word)", chunks_s2, 'topic_bert'),
    ("LDA: S1 (Doc-Level)", chunks_s1, 'topic_lda'),
    ("LDA: S2 (200-Word)", chunks_s2, 'topic_lda'),
]

p_values = []
for label, chunks_df, topic_col in chi_square_runs:
    chi2, p, dof, contingency = chi_square_topic_label(chunks_df, topic_col)
    p_values.append(p)
    sig = "✓ SIGNIFICANT" if p < 0.05 else "✗ not significant"
    print(f"\n {label}")
    print(f"\t Chi-square: {chi2:.2f}  |  df: {dof}  |  p-value: {p:.2e}  →  {sig}")

# Same order as chi_square_runs
p_bert_s1, p_bert_s2, p_lda_s1, p_lda_s2 = p_values

# SUMMARY TABLE

print("Evaluation Summary Table")

summary_df = pd.DataFrame({
    'Model' : ['BERTopic', 'BERTopic', 'LDA', 'LDA'],
    'Strategy' : ['S1 (Doc)', 'S2 (200w)', 'S1 (Doc)', 'S2 (200w)'],
    # Derived from the fitted models' topic word lists rather than hard-coded,
    # so the table stays correct if the number of topics changes.
    'N Topics' : [len(bert_s1_wl), len(bert_s2_wl), len(lda_s1_wl), len(lda_s2_wl)],
    'Diversity' : [round(div_bert_s1, 3), round(div_bert_s2, 3), round(div_lda_s1, 3), round(div_lda_s2, 3)],
    'Avg Similarity' : [round(mean_bert_s1, 4), round(mean_bert_s2, 4), round(mean_lda_s1, 4), round(mean_lda_s2, 4)],
    'Chi-sq p-value' : [f"{p_bert_s1:.2e}", f"{p_bert_s2:.2e}", f"{p_lda_s1:.2e}", f"{p_lda_s2:.2e}"],
    # Qualitative judgement of label readability, not a computed metric
    'Label Quality' : ['High (KeyBERT)', 'High (KeyBERT)', 'Medium (top words)', 'Medium (top words)']
})

print(summary_df.to_string(index=False))

===Evaluation 1: Topic Interpretability (BERTopic vs LDA)===

---Strategy 1 (Document-Level): BERTopic vs LDA---
Topic                                            BERTopic (KeyBERT)                                        LDA (Top Words)
   T0         education | organization | management | be | students    students | school | education | learning | children
   T1                     nature | characters | art | light | story                technology | human | world | time | new
   T2               restaurant | food | pizza | chicken | delicious      people | individuals | social | individual | life
   T3     sustainable | pollution | economic | driverless | climate         government | states | war | political | united
   T4 technology | technologies | communication | internet | social                use | ethical | law | privacy | animals
   T5                 nursing | nurses | patients | nurse | medical          cultural | women | people | culture | society
   T6        learning | features | recognition | datasets | dataset           health | care | patients | patient | medical
   T7                  sports | olympics | olympic | sport | soccer        college | electoral | states | vote | president
   T8           offenders | crime | justice | criminal | punishment                      like | just | time | people | don
   T9                              skin | hair | wear | wash | face                      year | 000 | games | said | years
  T10              mars | martian | geological | planets | landform                    car | cars | people | public | city
  T11          electoral | electors | voting | elections | election     company | business | market | products | marketing
  T12 antennas | antenna | wireless | communications | broadcasting                 research | data | used | study | based
  T13                       nfl | seahawks | steelers | afc | colts economic | energy | environmental | global | countries
  T14                                                             — management | employees | organization | project | work

---Strategy 2 (200-Word Windows): BERTopic vs LDA---
Topic                                              BERTopic (KeyBERT)                                            LDA (Top Words)
   T0                        art | culture | cultural | society | war                           time | year | said | years | day
   T1          technology | benefits | social | cars | transportation      information | process | patients | development | care
   T2               students | student | school | schools | education              media | social | introduction | paper | essay
   T3 management | managers | organization | organizations | business                         like | people | just | know | good
   T4                                  room | she | very | good | had          health | potential | use | energy | environmental
   T5                   ethical | genetic | animals | animal | rights    government | country | countries | economic | political
   T6                         cosmos | mars | nasa | space | universe                             women | water | al | et | body
   T7                   football | players | soccer | league | sports         students | school | education | learning | college
   T8         learning | networks | neural | recognition | algorithms                      human | time | face | language | life
   T9               electoral | electors | voter | elections | voters           world | social | cultural | individuals | people
  T10                              wash | using | face | clean | skin                      people | united | war | cars | states
  T11           lighting | light | photography | electricity | colors          company | market | business | services | products
  T12               refinery | oil | reservoirs | wastewater | fluids                  data | analysis | study | research | used
  T13                   eu | eus | european | europe | constitutional organization | employees | management | work | performance
  T14                                                               —                    web | new | 2011 | university | journal
===Evaluation 2: Topic Diversity===

BERTopic S1 (Doc-Level) diversity: 0.371
	BERTopic S2 (200-Word) diversity: 0.379
	LDA S1 (Doc-Level) diversity: 0.893
	LDA S2 (200-Word) diversity: 0.900
===Evaluation 3: Topic Distinctiveness (Cosine Similarity)===

Mean inter-topic similarity (lower = more distinct topics):
	BERTopic S1 (Doc-Level): 0.0000
	BERTopic S2 (200-Word): 0.0000
	LDA S1 (Doc-Level): 0.0101
	LDA S2 (200-Word): 0.0089

===Evaluation 4: Human vs. AI Topic Separation (Chi-Square Test)===

 BERTopic: S1 (Doc-Level)
	 Chi-square: 521.44  |  df: 13  |  p-value: 4.08e-103  →  ✓ SIGNIFICANT

 BERTopic: S2 (200-Word)
	 Chi-square: 2611.97  |  df: 13  |  p-value: 0.00e+00  →  ✓ SIGNIFICANT

 LDA: S1 (Doc-Level)
	 Chi-square: 870.13  |  df: 14  |  p-value: 1.08e-176  →  ✓ SIGNIFICANT

 LDA: S2 (200-Word)
	 Chi-square: 5887.17  |  df: 14  |  p-value: 0.00e+00  →  ✓ SIGNIFICANT
Evaluation Summary Table
   Model  Strategy  N Topics  Diversity  Avg Similarity Chi-sq p-value      Label Quality
BERTopic  S1 (Doc)        14      0.371          0.0000      4.08e-103     High (KeyBERT)
BERTopic S2 (200w)        14      0.379          0.0000       0.00e+00     High (KeyBERT)
     LDA  S1 (Doc)        15      0.893          0.0101      1.08e-176 Medium (top words)
     LDA S2 (200w)        15      0.900          0.0089       0.00e+00 Medium (top words)

INFO 230 Mini-Project 1: Human vs. AI Text (A Cultural Analytics Approach)¶

Project Structure (Following bCourses instructions):¶

Main Step 1: Load and Sample the Data¶

Sanity check: Is this a representative sample?¶

Main Step 2: Preliminary Corpus Statistics¶

Part 1: Sample-level Statistics¶

Part 2: Full dataset statistics (from distribution.csv)¶

Part 3: EDA Visualizations¶

What overall EDA now tells us:¶

Main Step 3: Chunking Strategies¶

Chunking Strategy Decision: Why Strategy 2?¶

Final Conclusion: Chunking Strategy 2 is better for topic modeling.¶

Main Step 4: Supervised Classification¶

Interactive Feature Importance Chart¶

Classification Results¶

TF-IDF Weighted Word Clouds: Human vs. AI¶

Word Cloud Interpretation:¶

Main Step 5: Topic Modeling (advanced option)¶

Main cultural question topic modeling will attempt to answer: do Human and AI texts cluster into meaningfully different topics, and does the answer change with different models or chunking strategies?¶

Model 1a: BERTopic on Chunking Strategy 1 (Document-Level Chunks)¶

Model 1b: BERTopic on Chunking Strategy 2 (200-Word Window Chunks)¶

Model 2a: LDA on Chunking Strategy 1 (Document-Level Chunks)¶

Model 2b: LDA on Chunking Strategy 2 (200-word window chunks)¶

Human vs. AI Topic Distribution: Interactive Heatmaps¶

Save the chunked files with topic assignments¶

Additional BERTopic Visualization¶

Topic Modeling Results: Discussion¶

Main Step 6: Evaluation ¶

Evaluation Results: Discussion¶

Main Step 7: Results, Discussion, Limitations, and Next Steps¶

Results Summary¶

Discussion¶

Limitations¶

Next Steps¶

	text	source	label	word_count
0	Oxygen gas (O 2) can be toxic at elevated part...	Human	0	105
1	The Tortoise and the Rabbit: A Story of Determ...	Mistral-7B	1	401
2	When we write words from another language, lik...	Text-Davinci-003	1	68