Population (millions) by age in top five US states by population by @trt

import pandas as pd
from novem import Plot
import inspect
import sys

# Module-level configuration
POPULATION_DIVISOR = 1_000_000.0  # raw counts are divided by this to express them in millions
CENSUS_YEAR = 2020  # year segment inserted into the Census API path
CENSUS_API_BASE = "https://api.census.gov/data"  # root URL the dataset path is appended to

def census_codes(start: int, end: int) -> list[str]:
    """Build the zero-padded, three-digit codes for every integer in start..end.

    Both endpoints are included; an empty list is returned when end < start.
    """
    pad = "{:03d}".format
    return list(map(pad, range(start, end + 1)))

def census_code_pair(male: tuple[int, int], female: tuple[int, int]) -> list[str]:
    """Return the codes of the male range followed by those of the female range.

    Each argument is an inclusive (start, end) pair that is expanded with
    census_codes.
    """
    male_codes = census_codes(*male)
    female_codes = census_codes(*female)
    return [*male_codes, *female_codes]

# Age brackets: display label -> B01001 variable suffixes, built from an
# inclusive male range and an inclusive female range.
# Every female range is the matching male range shifted by 24
# (3 -> 27, 4-5 -> 28-29, ..., 20-25 -> 44-49), so the brackets must not overlap.
# NOTE(review): per the B01001 variable list, 003/027 are the "Under 5 years"
# groups (ages 0-4); the "0-5" label is left unchanged here.
AGE_BRACKETS = [
    ("0-5",   census_code_pair((3, 3),   (27, 27))),
    ("5-14",  census_code_pair((4, 5),   (28, 29))),
    ("15-17", census_code_pair((6, 6),   (30, 30))),
    ("18-24", census_code_pair((7, 10),  (31, 34))),
    # The female range starts at 35, not 34: code 034 already belongs to
    # "18-24". Listing it in both brackets requested the variable twice and
    # double-counted that group into "25-44".
    ("25-44", census_code_pair((11, 14), (35, 38))),
    ("45-64", census_code_pair((15, 19), (39, 43))),
    ("65+",   census_code_pair((20, 25), (44, 49))),
]

# Bracket labels in display order (youngest to oldest)
AGE_ORDER = [label for label, _ in AGE_BRACKETS]

# Lookup table with one row per census variable: its age bracket and its
# full variable name (B01001_<code>E)
code_map = pd.DataFrame(
    [
        (label, f"B01001_{code}E")
        for label, codes in AGE_BRACKETS
        for code in codes
    ],
    columns=["bracket", "code"],
)

# Census API request: all mapped variables, for every state
codes_str = ",".join(code_map["code"])
census_url = (
    f"{CENSUS_API_BASE}/{CENSUS_YEAR}/acs/acs5"
    f"?get={codes_str}&for=state:*"
)

# Download the census response and reshape it to long form (state, code, pop)
raw_df = pd.read_json(census_url)
# Row 0 is used as the header. Assigning that row (a Series labelled 0) also
# names the column index 0, which is why the stacked level shows up below as
# a column called 0 and is renamed to "code".
raw_df.columns = raw_df.iloc[0]
raw_df = raw_df.drop(index=0).reset_index(drop=True)
stacked_counts = raw_df.set_index("state").stack()
raw_df = stacked_counts.reset_index(name="pop").rename(columns={0: "code"})

# Attach the age bracket to each row (joined on the columns both frames share)
merged_df = raw_df.merge(code_map)

# Look up state names: first table on the Wikipedia FIPS state code page
fips_tables = pd.read_html(
    "https://en.wikipedia.org/wiki/Federal_Information_Processing_Standard_state_code"
)
state_table = fips_tables[0].rename(columns={"Numeric code": "FIPS"})

# Numeric version of the census state code; values that cannot be parsed
# become missing (nullable Int64) instead of raising
state_numbers = pd.to_numeric(merged_df["state"], errors="coerce")
merged_df["state_fips_int"] = state_numbers.astype("Int64")

# Left join keeps every census row, even if no state name matches
full_data = pd.merge(
    merged_df,
    state_table,
    how="left",
    left_on="state_fips_int",
    right_on="FIPS",
)

# Express population in millions
full_data["pop"] = full_data["pop"].astype(float).div(POPULATION_DIVISOR)

# Ordered categorical so brackets keep AGE_ORDER instead of sorting as text
bracket_dtype = pd.CategoricalDtype(categories=AGE_ORDER, ordered=True)
full_data["bracket"] = full_data["bracket"].astype(bracket_dtype)

# One row per state name, one column per bracket, values summed
bracket_totals = full_data.pivot_table(
    index="Name",
    columns="bracket",
    values="pop",
    aggfunc="sum",
)

# Rank states by total population using a temporary "total" column
bracket_totals = bracket_totals.assign(total=bracket_totals.sum(axis=1))
pivot_data = bracket_totals.sort_values("total", ascending=False).drop(columns="total")

# Keep the five most populous states
top_states = pivot_data.head(5)

# Title and caption shown with the chart
plot_title = "Population (millions) by age in top five US states by population"
plot_caption = (
    "The 5 most populous states in the US. "
    "Data from the Census Bureau Data API, but not endorsed or certified by the Census Bureau. "
    "Calculations by novem."
)

# Create the Novem plot with id "state_pop" and chart type "gbar"
barchart = Plot("state_pop", type="gbar", name=plot_title, caption=plot_caption)

# Add 'public' to the plot's shares
barchart.shared += 'public'

# Send the top-five table to Novem by calling the plot with the DataFrame
barchart(top_states)

print(barchart.url)  # https://novem.no/p/qNGgN

# Use this script's own source, fenced as Python, as the plot description
script_source = inspect.getsource(sys.modules[__name__])
barchart.description = f"```python\n{script_source}\n```"
Population (millions) by age in top five US states by population

Summary

Description