import plotly.io as pio
pio.renderers.default = "notebook"

import pandas as pd
import numpy as np
import networkx as nx
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from collections import Counter
import warnings
import os

warnings.filterwarnings("ignore")

print(f"NetworkX : {nx.__version__}")
print(f"Pandas : {pd.__version__}")
print(f"NumPy : {np.__version__}")

NetworkX : 3.6.1
Pandas : 3.0.0
NumPy : 2.3.5

# community detection is handled differently depending on your version of NetworkX
    # Louvain was added in v2.7 so if it's not available we fall back to greedy modularity
        # either way, the rest of the notebook uses whichever method is available
try:
    from networkx.algorithms.community import louvain_communities
    HAS_LOUVAIN = True
except ImportError:
    from networkx.algorithms.community import greedy_modularity_communities
    HAS_LOUVAIN = False

# tells us which method we're working with for the rest of the notebook
method_name = "Louvain" if HAS_LOUVAIN else "Greedy Modularity"
print(f"Community detection method: {method_name}")

Community detection method: Louvain

# Configuration 
    # all file paths and analysis thresholds are kept here in one place
    # so they are easy to adjust without hunting through the notebook
DATA_DIR = "../data"    
NET_OUT = "./network_output"
os.makedirs(NET_OUT, exist_ok=True)

# minimum number of shared comic appearances required to include an edge
    # set to 2 to filter out one-off coincidental appearances
MIN_EDGE_WEIGHT = 2

# maximum nodes to show in any network visualization
# keeping this low is intentional
    # plotting all 6,000+ heroes produces an unreadable hairball (hence the commented-out 150 below)
# TOP_N_NODES = 150    # BEWARE HAIRBALL
TOP_N_NODES = 50
SEED = 230    # random seed for reproducible layouts and community detection

# for figures
community_colors = [
    "#e63946","#457b9d","#2a9d8f","#e9c46a",
    "#f4a261","#264653","#a8dadc","#c77dff"
]
community_names = {
    0:  "Defenders and Mystic Heroes",
    1:  "X-Men",
    2:  "Fantastic Four",
    7:  "Thor and Cosmic Heroes",
    8:  "Avengers",
    13: "Spider-Mans World"
}

print("Configuration ready:")
print(f"\tData directory : {DATA_DIR}")
print(f"\tOutput directory : {NET_OUT}")
print(f"\tMin edge weight : {MIN_EDGE_WEIGHT}")
print(f"\tMax display nodes: {TOP_N_NODES}")
print(f"\tSeed : {SEED}")

Configuration ready:
	Data directory : ../data
	Output directory : ./network_output
	Min edge weight : 2
	Max display nodes: 50
	Seed : 230

# load all three raw files 
nodes = pd.read_csv("../data/nodes.csv")
edges = pd.read_csv("../data/edges.csv")
hero_net = pd.read_csv("../data/hero-network.csv")

# quick inspection 
print("nodes.csv :", nodes.shape)
print("edges.csv :", edges.shape)
print("hero-network :", hero_net.shape)

nodes.csv : (19090, 2)
edges.csv : (96104, 2)
hero-network : (574467, 2)

# count how many times each hero pair co-appears 
    # this becomes the edge weight 
# raw hero-network.csv lists each shared comic as a separate row 
    # so grouping and counting gives a proper weighted edge list 
hero_weighted = (
    hero_net.groupby(["hero1", "hero2"]).size().reset_index(name="weight")
)

print("Unique hero pairs (edges):", len(hero_weighted))
print("Weight distribution:")
print(hero_weighted["weight"].describe().round(2))

Unique hero pairs (edges): 224181
Weight distribution:
count    224181.00
mean          2.56
std           7.90
min           1.00
25%           1.00
50%           1.00
75%           2.00
max        1275.00
Name: weight, dtype: float64

# filter the data 
hero_filtered = hero_weighted[hero_weighted["weight"] >= MIN_EDGE_WEIGHT].copy()
print(f"Edges before filtering : {len(hero_weighted):,}")
print(f"Edges after filtering : {len(hero_filtered):,}")
print(f"Edges removed : {len(hero_weighted) - len(hero_filtered):,}")

Edges before filtering : 224,181
Edges after filtering : 85,352
Edges removed : 138,829

# names truncated at exactly 20 characters are a known encoding issue in this dataset
truncated = nodes[nodes["type"] == "hero"]["node"].str.len() == 20
print(f"Potentially truncated hero names: {truncated.sum()}")

Potentially truncated hero names: 1351

# quick validation pass across all three files before building any graphs
    # this confirms no null values slipped through and documents basic shape in a clean summary output 
print("=== nodes.csv ===")
print(f"\tShape: {nodes.shape}")
print(f"\tTypes: {nodes['type'].value_counts().to_dict()}")
print(f"\tNulls: {nodes.isnull().sum().to_dict()}")

print("\n=== edges.csv ===")
print(f"\tShape: {edges.shape}")
print(f"\tUnique heroes: {edges['hero'].nunique()}")
print(f"\tUnique comics: {edges['comic'].nunique()}")
print(f"\tNulls: {edges.isnull().sum().to_dict()}")

print("\n=== hero-network.csv (after weighting + filtering) ===")
print(f"\tRaw rows: {len(hero_net):,}")
print(f"\tUnique pairs: {len(hero_weighted):,}")
print(f"\tAfter filter: {len(hero_filtered):,}")
print(f"\tNulls: {hero_filtered.isnull().sum().to_dict()}")

=== nodes.csv ===
	Shape: (19090, 2)
	Types: {'comic': 12651, 'hero': 6439}
	Nulls: {'node': 0, 'type': 0}

=== edges.csv ===
	Shape: (96104, 2)
	Unique heroes: 6439
	Unique comics: 12651
	Nulls: {'hero': 0, 'comic': 0}

=== hero-network.csv (after weighting + filtering) ===
	Raw rows: 574,467
	Unique pairs: 224,181
	After filter: 85,352
	Nulls: {'hero1': 0, 'hero2': 0, 'weight': 0}

# build the weighted hero-hero graph from our filtered edge list
    # each edge carries a "weight" attribute = number of shared comic appearances
    # the higher the weight -> the more times that pair of heroes has co-appeared
G_weighted = nx.Graph() 

for _, row in hero_filtered.iterrows():
    G_weighted.add_edge(row["hero1"], row["hero2"], weight=row["weight"])

print("Weighted graph")
print(f"\tNodes: {G_weighted.number_of_nodes():,}")
print(f"\tEdges: {G_weighted.number_of_edges():,}")

Weighted graph
	Nodes: 4,465
	Edges: 62,128

# build the unweighted version by copying the weighted graph and setting all edge weights to 1
    # this lets us compare centrality rankings with and without frequency of co-appearance
    # same nodes and edges as G_weighted: only difference is every connection is treated as equal
G_unweighted = nx.Graph(G_weighted)
for u, v in G_unweighted.edges(): 
    G_unweighted[u][v]["weight"] = 1 

print("Unweighted graph")
print(f"\tNodes: {G_unweighted.number_of_nodes():,}")
print(f"\tEdges: {G_unweighted.number_of_edges():,}")

Unweighted graph
	Nodes: 4,465
	Edges: 62,128

# note that between-ness is an expensive computation on large graphs 
    # use a sample of 500 nodes to approximate (standard practice for networks this size)

# degree centrality: what fraction of all heroes does each hero co-appear with?
    # fast to compute, serves as our sanity check
print("Computing degree centrality...")
deg_weighted = nx.degree_centrality(G_weighted)
deg_unweighted = nx.degree_centrality(G_unweighted)

# betweenness centrality: which heroes sit on the shortest paths between other heroes?
    # this is expensive to compute exactly on a graph this size, so we approximate
    # using a random sample of 500 nodes (k=500): standard practice for large networks
print("Computing betweenness centrality (approximated, k=500)...")
btw_weighted = nx.betweenness_centrality(G_weighted, weight="weight", k=500, seed=SEED)
btw_unweighted = nx.betweenness_centrality(G_unweighted, k=500, seed=SEED)

# eigenvector centrality: how well-connected are a hero's connections?
    # max_iter=1000 gives the algorithm enough iterations to converge on this size graph
print("Computing eigenvector centrality...")
eig_weighted = nx.eigenvector_centrality(G_weighted, weight="weight", max_iter=1000)
eig_unweighted = nx.eigenvector_centrality(G_unweighted, max_iter=1000)

print("Done.")

Computing degree centrality...
Computing betweenness centrality (approximated, k=500)...
Computing eigenvector centrality...
Done.

# compile all centrality results into a single dataframe for easy comparison
    # each row is one hero, each column is a metric (weighted and unweighted versions)
    # adapted from the centrality metrics table pattern in the course Holmes co-occurrence notebook
heroes_list = list(G_weighted.nodes())

metrics_df = pd.DataFrame({
    "hero" : heroes_list,
    "degree_weighted" : [deg_weighted[h] for h in heroes_list],
    "degree_unweighted" : [deg_unweighted[h] for h in heroes_list],
    "betweenness_wtd" : [btw_weighted[h] for h in heroes_list],
    "betweenness_unwtd" : [btw_unweighted[h] for h in heroes_list],
    "eigenvector_wtd" : [eig_weighted[h] for h in heroes_list],
    "eigenvector_unwtd" : [eig_unweighted[h] for h in heroes_list],
    "raw_degree" : [G_weighted.degree(h) for h in heroes_list],
}).sort_values("degree_weighted", ascending=False).reset_index(drop=True)    # sort by weighted degree (most connected heroes at top)

# quick sanity check (if Captain America and Spider-Man aren't near the top something is wrong)
print("Top 10 by weighted degree:")
print(metrics_df[["hero","degree_weighted","betweenness_wtd","eigenvector_wtd"]].head(10))

Top 10 by weighted degree:
                   hero  degree_weighted  betweenness_wtd  eigenvector_wtd
0       CAPTAIN AMERICA         0.225806         0.060866         0.010815
1  SPIDER-MAN/PETER PAR         0.195789         0.051988         0.002985
2  IRON MAN/TONY STARK          0.170251         0.027778         0.006454
3      WOLVERINE/LOGAN          0.159722         0.033317         0.002875
4  THING/BENJAMIN J. GR         0.149642         0.024430         0.005661
5  THOR/DR. DONALD BLAK         0.148297         0.023400         0.005265
6  SCARLET WITCH/WANDA          0.145833         0.019366         0.006129
7  MR. FANTASTIC/REED R         0.142697         0.030277         0.005415
8  HUMAN TORCH/JOHNNY S         0.138441         0.019567         0.005508
9                  HAWK         0.133065         0.034386         0.004645

# run community detection on the weighted graph
    # Louvain groups heroes into communities by maximizing how densely connected
        # each group is relative to what you'd expect by chance
if HAS_LOUVAIN:
    communities = louvain_communities(G_weighted, weight="weight", seed=SEED)
    method_used = "Louvain"

# fallback if Louvain isn't available in this version of NetworkX
else:
    communities = greedy_modularity_communities(G_weighted, weight="weight")
    method_used = "Greedy Modularity"

# build a lookup dictionary: hero name -> community ID
    # (easy to color nodes and filter by community later)
community_map = {}
for i, comm in enumerate(communities):
    for hero in comm:
        community_map[hero] = i

# add community assignment as a column in our metrics dataframe
metrics_df["community"] = metrics_df["hero"].map(community_map)

print(f"Method: {method_used}")
print(f"Communities detected: {len(communities)}")

# the top 5 by size gives us a sense of how the network is structured
    # we expect a few large communities (major teams) and many small ones (minor characters)
print("\nTop 5 communities by size:")
comm_sizes = pd.Series([len(c) for c in communities]).sort_values(ascending=False)
print(comm_sizes.head())

Method: Louvain
Communities detected: 36

Top 5 communities by size:
1     901
8     697
13    659
0     418
2     408
dtype: int64

# build the hero-comic network directly from edges.csv
    # this is a two-sided network: heroes on one side, comic issues on the other
        # unlike the hero-hero graph, no projection has happened here yet (working with the raw co-appearance data)
B = nx.Graph()

heroes = edges["hero"].unique()
comics = edges["comic"].unique()

# the bipartite=0/1 tags are required by NetworkX to recognize the two sides
B.add_nodes_from(heroes, bipartite=0)    # heroes 
B.add_nodes_from(comics, bipartite=1)    # comics 

# each row in edges.csv is one hero appearing in one comic issue
for _, row in edges.iterrows(): 
    B.add_edge(row["hero"], row["comic"])

print("Hero-comic network:")
print(f"\tHero nodes: {len(heroes):,}")
print(f"\tComic nodes: {len(comics):,}")
print(f"\tEdges: {B.number_of_edges():,}")
print(f"Valid two-sided structure: {nx.is_bipartite(B)}")

Hero-comic network:
	Hero nodes: 6,439
	Comic nodes: 12,651
	Edges: 96,104
Valid two-sided structure: True

# extract the degree of every hero in the weighted graph
degrees = [d for _, d in G_weighted.degree()]

fig = make_subplots(rows=1, cols=2,
    subplot_titles=("Degree Distribution", "Log-Log Degree Distribution"))

# left: standard histogram showing the raw degree distribution
fig.add_trace(
    go.Histogram(x=degrees, nbinsx=100, marker_color="#e63946", name="Degree"),
    row=1, col=1
)

# right: log-log plot to check for scale-free behavior
    # if the points fall roughly on a straight line in log-log space, the network follows a power law
        # meaning a small number of heroes have vastly more connections than everyone else
degree_counts = Counter(degrees)
x_vals = sorted(degree_counts.keys())
y_vals = [degree_counts[x] for x in x_vals]

fig.add_trace(
    go.Scatter(x=x_vals, y=y_vals, mode="markers",
               marker=dict(color="#457b9d", size=4), name="Log-Log"),
    row=1, col=2
)

fig.update_xaxes(type="log", title_text="Degree (log)", row=1, col=2)
fig.update_yaxes(type="log", title_text="Count (log)", row=1, col=2)
fig.update_xaxes(title_text="Degree", row=1, col=1)
fig.update_yaxes(title_text="Count", row=1, col=1)

fig.update_layout(
    title_text="Hero Degree Distribution: Weighted Graph",
    showlegend=False,
    height=450,
    width=700
)
fig.show(renderer="notebook")

top_n = 15

# helper function that pulls the top N heroes for a given metric
    # and returns a side-by-side table of weighted vs unweighted rankings
def top_comparison(metric_wtd, metric_unwtd, label):
    wtd = metrics_df.nlargest(top_n, metric_wtd)[["hero", metric_wtd]].reset_index(drop=True)
    unwtd = metrics_df.nlargest(top_n, metric_unwtd)[["hero", metric_unwtd]].reset_index(drop=True)
    wtd.columns = ["Hero (weighted)", f"{label} (wtd)"]
    unwtd.columns = ["Hero (unweighted)", f"{label} (unwtd)"]
    return pd.concat([wtd, unwtd], axis=1)

print("=== Degree Centrality ===")
print(top_comparison("degree_weighted", "degree_unweighted", "Degree").to_string(index=False))

print("\n=== Betweenness Centrality ===")
print(top_comparison("betweenness_wtd", "betweenness_unwtd", "Betweenness").to_string(index=False))

print("\n=== Eigenvector Centrality ===")
print(top_comparison("eigenvector_wtd", "eigenvector_unwtd", "Eigenvector").to_string(index=False))

=== Degree Centrality ===
     Hero (weighted)  Degree (wtd)    Hero (unweighted)  Degree (unwtd)
     CAPTAIN AMERICA      0.225806      CAPTAIN AMERICA        0.225806
SPIDER-MAN/PETER PAR      0.195789 SPIDER-MAN/PETER PAR        0.195789
IRON MAN/TONY STARK       0.170251 IRON MAN/TONY STARK         0.170251
    WOLVERINE/LOGAN       0.159722     WOLVERINE/LOGAN         0.159722
THING/BENJAMIN J. GR      0.149642 THING/BENJAMIN J. GR        0.149642
THOR/DR. DONALD BLAK      0.148297 THOR/DR. DONALD BLAK        0.148297
SCARLET WITCH/WANDA       0.145833 SCARLET WITCH/WANDA         0.145833
MR. FANTASTIC/REED R      0.142697 MR. FANTASTIC/REED R        0.142697
HUMAN TORCH/JOHNNY S      0.138441 HUMAN TORCH/JOHNNY S        0.138441
                HAWK      0.133065                 HAWK        0.133065
BEAST/HENRY &HANK& P      0.133065 BEAST/HENRY &HANK& P        0.133065
             VISION       0.132841              VISION         0.132841
CYCLOPS/SCOTT SUMMER      0.129928 CYCLOPS/SCOTT SUMMER        0.129928
STORM/ORORO MUNROE S      0.129928 STORM/ORORO MUNROE S        0.129928
INVISIBLE WOMAN/SUE       0.127464 INVISIBLE WOMAN/SUE         0.127464

=== Betweenness Centrality ===
     Hero (weighted)  Betweenness (wtd)    Hero (unweighted)  Betweenness (unwtd)
     CAPTAIN AMERICA           0.060866 SPIDER-MAN/PETER PAR             0.088173
SPIDER-MAN/PETER PAR           0.051988      CAPTAIN AMERICA             0.084346
                HAWK           0.034386     WOLVERINE/LOGAN              0.046091
    WOLVERINE/LOGAN            0.033317 IRON MAN/TONY STARK              0.041897
MR. FANTASTIC/REED R           0.030277                 HAWK             0.037527
IRON MAN/TONY STARK            0.027778 THOR/DR. DONALD BLAK             0.036549
PUNISHER II/FRANK CA           0.027709 DAREDEVIL/MATT MURDO             0.031282
FURY, COL. NICHOLAS            0.027186 MR. FANTASTIC/REED R             0.030192
THING/BENJAMIN J. GR           0.024430 THING/BENJAMIN J. GR             0.029502
THOR/DR. DONALD BLAK           0.023400  HAVOK/ALEX SUMMERS              0.028371
 HAVOK/ALEX SUMMERS            0.023303 DR. STRANGE/STEPHEN              0.027734
DAREDEVIL/MATT MURDO           0.023110 PUNISHER II/FRANK CA             0.027376
DR. STRANGE/STEPHEN            0.023082 FURY, COL. NICHOLAS              0.026515
SUB-MARINER/NAMOR MA           0.022711 SHE-HULK/JENNIFER WA             0.025340
WATSON-PARKER, MARY            0.021304 SUB-MARINER/NAMOR MA             0.025129

=== Eigenvector Centrality ===
     Hero (weighted)  Eigenvector (wtd)    Hero (unweighted)  Eigenvector (unwtd)
   PATRIOT/JEFF MACE           0.783672      CAPTAIN AMERICA             0.135910
MISS AMERICA/MADELIN           0.619115     WOLVERINE/LOGAN              0.116683
HUMAN TORCH ANDROID/           0.031016 SCARLET WITCH/WANDA              0.115884
   DORMA [ATLANTEAN]           0.027687 IRON MAN/TONY STARK              0.114405
     CAPTAIN AMERICA           0.010815              VISION              0.113984
IRON MAN/TONY STARK            0.006454 SPIDER-MAN/PETER PAR             0.112280
WHIZZER/ROBERT L. FR           0.006394 THING/BENJAMIN J. GR             0.112253
             VISION            0.006241 BEAST/HENRY &HANK& P             0.110432
SCARLET WITCH/WANDA            0.006129 HUMAN TORCH/JOHNNY S             0.110242
THING/BENJAMIN J. GR           0.005661 MR. FANTASTIC/REED R             0.109859
HUMAN TORCH/JOHNNY S           0.005508 THOR/DR. DONALD BLAK             0.109238
WASP/JANET VAN DYNE            0.005437 STORM/ORORO MUNROE S             0.107589
MR. FANTASTIC/REED R           0.005415 WASP/JANET VAN DYNE              0.107101
INVISIBLE WOMAN/SUE            0.005329 CYCLOPS/SCOTT SUMMER             0.106120
THOR/DR. DONALD BLAK           0.005265 INVISIBLE WOMAN/SUE              0.105972

# bar chart showing the top 15 heroes by weighted degree, colored by their detected community
    # this lets us see at a glance whether the most connected heroes cluster into the same community
        # or whether they span multiple different teams

top15 = metrics_df.head(top_n).copy()
top15["community_str"] = top15["community"].map(community_names).fillna("Community " + top15["community"].astype(str))

fig = px.bar(
    top15, 
    x="hero",
    y="degree_weighted",
    color="community_str",
    title="Top 15 Heroes by Weighted Degree Centrality",
    labels={"hero":"Hero", "degree_weighted":"Degree Centrality (weighted)", "community_str": "Community"},
    height=500
)
fig.update_layout(xaxis_tickangle=45)
fig.show(renderer="notebook")

# count how many heroes belong to each community and grab the top 8 by size
top_communities = (
    metrics_df.groupby("community").size().reset_index(name="size").sort_values("size", ascending=False).head(8)
)

# map community IDs to readable names for the x-axis
top_communities["community_name"] = top_communities["community"].map(community_names).fillna("Community " + top_communities["community"].astype(str))

fig = px.bar(
    top_communities,
    x="community_name",
    y="size",
    title="Top 8 Communities by Size",
    labels={"community_name": "Community", "size": "Number of Heroes"},
    color="size",
    color_continuous_scale="Blues",
    height=400
)
fig.update_layout(xaxis_tickangle=45)
fig.show(renderer="notebook")

# print the top 5 heroes per community by weighted degree
    # this helps us interpret what each community actually represents and verify whether the algorithm recovered real Marvel teams
print("Top 5 heroes per community (by weighted degree): \n")
for comm_id in top_communities["community"].values:
    heroes_in_comm = (
        metrics_df[metrics_df["community"] == comm_id]
        .nlargest(5, "degree_weighted")["hero"]
        .tolist()
    )

    # use the readable community name if we have one, otherwise fall back to the ID
    comm_label = community_names.get(comm_id, f"Community {comm_id}")
    print(f"\t{comm_label}: {', '.join(heroes_in_comm)}")

Top 5 heroes per community (by weighted degree): 

	X-Men: WOLVERINE/LOGAN , BEAST/HENRY &HANK& P, CYCLOPS/SCOTT SUMMER, STORM/ORORO MUNROE S, COLOSSUS II/PETER RA
	Avengers: CAPTAIN AMERICA, IRON MAN/TONY STARK , SCARLET WITCH/WANDA , HAWK, VISION 
	Spider-Mans World: SPIDER-MAN/PETER PAR, DAREDEVIL/MATT MURDO, JAMESON, J. JONAH, WATSON-PARKER, MARY , ROBERTSON, JOE
	Defenders and Mystic Heroes: HULK/DR. ROBERT BRUC, DR. STRANGE/STEPHEN , JONES, RICHARD MILHO, NORRISS, SISTER BARB, HELLCAT/PATSY WALKER
	Fantastic Four: THING/BENJAMIN J. GR, MR. FANTASTIC/REED R, HUMAN TORCH/JOHNNY S, INVISIBLE WOMAN/SUE , SILVER SURFER/NORRIN
	Thor and Cosmic Heroes: THOR/DR. DONALD BLAK, THUNDERSTRIKE/ERIC K, ODIN [ASGARDIAN], LOKI [ASGARDIAN], BALDER [ASGARDIAN]
	Community 11: BEETLE/ABNER RONALD , MOONSTONE II/KARLA S, SCREAMING MIMI/MELIS, POWER MAN/ERIK JOSTE, CITIZEN V II/HELMUT 
	Community 5: NOVA/RICHARD RIDER, FIRESTAR/ANGELICA JO, NAMORITA/NITA PRENTI, SPEEDBALL/ROBBIE BAL, JUSTICE II/VANCE AST

# look at the network from the comics side: which issues brought the most heroes together?
    # this is the editorial production angle: a comic with many heroes was a deliberate crossover or team event, not just a solo issue
comic_degree = {n: d for n, d in B.degree() if n in set(comics)}

# grab the top 15 comics by number of heroes featured
top_comics = pd.DataFrame(
    sorted(comic_degree.items(), key=lambda x: -x[1])[:15],
    columns=["comic", "hero_count"]
)

# the dataset uses internal comic abbreviations — we decode the known ones here
comic_labels = {
    "COC 1"    : "COC 1 (Contest of Champions)",
    "IW 1"     : "IW 1 (Infinity War)",
    "IW 2"     : "IW 2 (Infinity War)",
    "IW 3"     : "IW 3 (Infinity War)",
    "IW 4"     : "IW 4 (Infinity War)",
    "IW 6"     : "IW 6 (Infinity War)",
    "H2 279"   : "H2 279 (Hulk)",
    "FF 368"   : "FF 368 (Fantastic Four)",
    "FF 369"   : "FF 369 (Fantastic Four)",
    "FF 370"   : "FF 370 (Fantastic Four)",
    "FF 3"     : "FF 3 (Fantastic Four)",
    "TB 25"    : "TB 25 (Thunderbolts)",
    "A3 1"     : "A3 1 (Avengers)",
    "MAXSEC 3" : "MAXSEC 3 (Maximum Security)",
    "M/GN 1"   : "M/GN 1 (Marvel graphic novel)",
}

top_comics["comic_label"] = top_comics["comic"].map(comic_labels).fillna(top_comics["comic"])

fig = px.bar(
    top_comics,
    x="comic",           # keep the short code on the x-axis
    y="hero_count",
    title="Top 15 Comics by Number of Heroes Featured",
    labels={"comic": "Comic Issue", "hero_count": "Number of Heroes"},
    hover_data={"comic_label": True, "comic": False, "hero_count": True},
    height=500
)
fig.update_layout(xaxis_tickangle=45)
fig.show(renderer="notebook")

# flip the question around: instead of which comics had the most heroes,
    # which heroes showed up across the most comic issues?
hero_degree_B = {n: d for n, d in B.degree() if n in set(heroes)}

# grab the top 15 heroes by number of issues appeared in
top_hero_comics = pd.DataFrame(
    sorted(hero_degree_B.items(), key=lambda x: -x[1])[:15],
    columns=["hero", "issue_count"]
)

fig = px.bar(
    top_hero_comics,
    x="hero",
    y="issue_count",
    title="Top 15 Heroes by Number of Comic Issues Appeared In",
    labels={"hero": "Hero", "issue_count": "Number of Issues"},
    color="issue_count",
    color_continuous_scale="Reds",
    height=450
)
fig.update_layout(xaxis_tickangle=45)
fig.show(renderer="notebook")

# filter the full graph down to the top N heroes by degree for visualization
    # plotting all 4,000+ heroes at once produces an unreadable hairball
# TOP_N_NODES is set in the config cell at the top of the notebook
top_nodes = metrics_df.head(TOP_N_NODES)["hero"].tolist()
G_sub = G_weighted.subgraph(top_nodes).copy()

print(f"Subgraph nodes: {G_sub.number_of_nodes()}")
print(f"Subgraph edges: {G_sub.number_of_edges()}")

Subgraph nodes: 50
Subgraph edges: 1171

# static inline visualization of the top 50 heroes by degree
    # node color = community, node size = degree
        # this is a preview only (the full interactive per-community versions are in the figures folder)

# compute a spring layout: nodes with stronger ties are pulled closer together
pos = nx.spring_layout(G_sub, weight="weight", seed=SEED)

# color each node by its community and scale size by degree
node_colors = [community_colors[community_map.get(n, 0) % len(community_colors)] for n in G_sub.nodes()]
node_sizes  = [5 + G_weighted.degree(n) * 0.08 for n in G_sub.nodes()]

# build edge traces: each edge needs a start point, end point, and None to break the line
edge_x, edge_y = [], []
for u, v in G_sub.edges():
    x0, y0 = pos[u]
    x1, y1 = pos[v]
    edge_x += [x0, x1, None]
    edge_y += [y0, y1, None]

node_x = [pos[n][0] for n in G_sub.nodes()]
node_y = [pos[n][1] for n in G_sub.nodes()]

# use readable community names in the hover tooltip instead of raw IDs
def get_community_label(n):
    comm_id = community_map.get(n)
    return community_names.get(comm_id, f"Community {comm_id}")

node_text = [
    f"{n}<br>Degree: {G_weighted.degree(n)}<br>Community: {get_community_label(n)}"
    for n in G_sub.nodes()
]

fig = go.Figure()

# add edges as a faint line trace
fig.add_trace(go.Scatter(
    x=edge_x, y=edge_y,
    mode="lines",
    line=dict(width=0.5, color="rgba(255,255,255,0.15)"),
    hoverinfo="none"
))

# add nodes as a scatter trace with labels and hover info
fig.add_trace(go.Scatter(
    x=node_x, y=node_y,
    mode="markers+text",
    marker=dict(
        size=node_sizes,
        color=node_colors,
        line=dict(width=0.5, color="white")
    ),
    text=list(G_sub.nodes()),
    textposition="top center",
    textfont=dict(size=7, color="white"),
    hovertext=node_text,
    hoverinfo="text"
))

fig.update_layout(
    title="Top 50 Heroes by Degree — Colored by Community",
    paper_bgcolor="#1a1a2e",
    plot_bgcolor="#1a1a2e",
    font=dict(color="white"),
    showlegend=False,
    height=700,
    xaxis=dict(showgrid=False, zeroline=False, showticklabels=False),
    yaxis=dict(showgrid=False, zeroline=False, showticklabels=False)
)

fig.show(renderer="notebook")

# export individual community subgraphs for Cytoscape
    # much cleaner than the full graph (one team at a time)
for comm_id, comm_name in community_names.items():
    comm_nodes = (
        metrics_df[metrics_df["community"] == comm_id]
        .nlargest(20, "degree_weighted")["hero"]  # drop from 40 to 20
        .tolist()
    )
    G_comm_export = G_weighted.subgraph(comm_nodes).copy()
    
    # only keep edges with weight >= 5 to show strongest relationships only
    edges_to_remove = [(u, v) for u, v, d in G_comm_export.edges(data=True) if d.get("weight", 0) < 5]
    G_comm_export.remove_edges_from(edges_to_remove)
        
    # tag nodes before export
    for node in G_comm_export.nodes():
        G_comm_export.nodes[node]["degree"] = int(G_weighted.degree(node))
        G_comm_export.nodes[node]["community"] = int(community_map.get(node, -1))
    
    export_path = os.path.join(NET_OUT, f"marvel_{comm_name.replace(' ', '_').replace('&', 'and').replace(chr(39), '')}.graphml")
    nx.write_graphml(G_comm_export, export_path)
    print(f"Exported: {export_path}")

Exported: ./network_output/marvel_Defenders_and_Mystic_Heroes.graphml
Exported: ./network_output/marvel_X-Men.graphml
Exported: ./network_output/marvel_Fantastic_Four.graphml
Exported: ./network_output/marvel_Thor_and_Cosmic_Heroes.graphml
Exported: ./network_output/marvel_Avengers.graphml
Exported: ./network_output/marvel_Spider-Mans_World.graphml

# generate interactive pyvis HTML for the top 4 communities by size
    # each file shows the top 40 heroes within that community
        # saved to the figures folder and linked from the GitHub Pages index

from pyvis.network import Network

FIGURES_DIR = "../figures"
os.makedirs(FIGURES_DIR, exist_ok=True)

# get the top 4 community IDs by size
top_comm_ids = (
    metrics_df.groupby("community").size().sort_values(ascending=False).head(4).index.tolist())

for COMMUNITY_ID in top_comm_ids:

    # filter to top 40 heroes in this community by weighted degree
    community_nodes_filtered = (
        metrics_df[metrics_df["community"] == COMMUNITY_ID].nlargest(40, "degree_weighted")["hero"].tolist())

    G_comm = G_weighted.subgraph(community_nodes_filtered).copy()
    print(f"Community {COMMUNITY_ID} — {G_comm.number_of_nodes()} nodes, {G_comm.number_of_edges()} edges")

    nt = Network(height="700px", width="100%", bgcolor="#1a1a2e", font_color="white", notebook=False)

    for node in G_comm.nodes():
        deg  = G_weighted.degree(node)
        comm = community_map.get(node, 0) % len(community_colors)
        nt.add_node(node, label=node, color=community_colors[comm], size=min(5 + deg * 0.08, 30), title=f"{node}<br>Degree: {deg}<br>Community: {community_names.get(COMMUNITY_ID, str(COMMUNITY_ID))}")

    for u, v, data in G_comm.edges(data=True):
        nt.add_edge(u, v, value=data.get("weight", 1))

    nt.set_options("""
{
  "physics": {
    "stabilization": {"enabled": true, "iterations": 500},
    "forceAtlas2Based": {
      "gravitationalConstant": -120,
      "springLength": 200,
      "springConstant": 0.05
    },
    "solver": "forceAtlas2Based",
    "minVelocity": 0.75
  }
}
""")

    filename = os.path.join(FIGURES_DIR, f"interactive_community_{COMMUNITY_ID}.html")
    nt.save_graph(filename)

    # inject a readable title and fixed header into the saved HTML
    with open(filename, "r") as f:
        html = f.read()

    comm_label = community_names.get(COMMUNITY_ID, f"Community {COMMUNITY_ID}")
    html = html.replace(
        "<head>",
        f"<head><title>{comm_label} — Marvel Network</title>"
        f"<style>body::before {{content: '{comm_label} — Top 40 Heroes'; "
        f"position: fixed; top: 10px; left: 50%; transform: translateX(-50%); "
        f"color: white; font-family: Arial, sans-serif; font-size: 18px; "
        f"font-weight: bold; z-index: 9999;}}</style>"
    )
    with open(filename, "w") as f:
        f.write(html)

    print(f"\t Saved to {filename}")

Community 1 — 40 nodes, 721 edges
	 Saved to ../figures/interactive_community_1.html
Community 8 — 40 nodes, 687 edges
	 Saved to ../figures/interactive_community_8.html
Community 13 — 40 nodes, 573 edges
	 Saved to ../figures/interactive_community_13.html
Community 0 — 40 nodes, 278 edges
	 Saved to ../figures/interactive_community_0.html

1. Motivation¶

General Setup¶

Load in the data¶

2. Dataset Development¶

Development Step 1: Build a weighted edge list from scratch¶

Development Step 2: Apply a minimum edge weight threshold¶

Development Step 3: Flag truncated hero names¶

Development Step 4: Validation of all datasets that they are good to proceed with¶

3. Analytical Approach & Metric Justification¶

Part 3.1: Build the weighted hero-hero graph¶

Part 3.2: Build the unweighted version for comparison¶

Part 3.3: Compute centrality metrics on both graphs¶

Part 3.4: Compile into a metrics dataframe¶

Part 3.5: Community detection¶

Part 3.6: Build the hero-comic network from edges.csv¶

4. Results & Visualizations¶

Part 4.1: Degree Distribution¶

4.2: Centrality Rankings: Weighted vs. Unweighted¶

Part 4.3: Community Detection Results¶

Part 4.4 The Comics Side of the Network¶

Part 4.5: Cytoscape Export and PyVis Interactive Network Visualizations¶

Part 4.5 continued: Interactive Community Visualizations¶

5. Discussion¶

6. Limitations ¶

7. Further Study¶

INFO 230 Mini-Project 2: Marvel Universe Social Network¶

1. Motivation¶

General Setup¶

Load in the data¶

2. Dataset Development¶

Development Step 1: Build a weighted edge list from scratch¶

Development Step 2: Apply a minimum edge weight threshold¶

Development Step 3: Flag truncated hero names¶

Development Step 4: Validation of all datasets that they are good to proceed with¶

3. Analytical Approach & Metric Justification¶

Part 3.1: Build the weighted hero-hero graph¶

Part 3.2: Build the unweighted version for comparison¶

Part 3.3: Compute centrality metrics on both graphs¶

Part 3.4: Compile into a metrics dataframe¶

Part 3.5: Community detection¶

Part 3.6: Build the hero-comic network from edges.csv¶

4. Results & Visualizations¶

Part 4.1: Degree Distribution¶

4.2: Centrality Rankings: Weighted vs. Unweighted¶

Part 4.3: Community Detection Results¶

Part 4.4 The Comics Side of the Network¶

Part 4.5: Cytoscape Export and PyVis Interactive Network Visualizations¶

Part 4.5 continued: Interactive Community Visualizations¶

5. Discussion¶

6. Limitations ¶

7. Further Study¶