Back to Superset

Generate GeoJSON from Natural Earth Data

superset-frontend/plugins/legacy-plugin-chart-country-map/scripts/Country Map GeoJSON Generator.ipynb

2021.41.040.0 KB
Original Source

Generate GeoJSON from Natural Earth Data

Install Dependencies

pip install geopandas shapely matplotlib

Download Data

Download datasets (Admin 0 - Countries in 1:10, and Admin 1 – States, Provinces in 1:10 and 1:50) from Natural Earth Data:

python
# Dependencies

import os
import json
import requests
import geopandas as gpd
import matplotlib.pyplot as plt
import shapely
import pandas as pd
import shapely.geometry
import shapely.ops
import shapely.affinity
from shapely.geometry import Polygon, MultiPolygon
import shutil
python
speed_run = False
# set this to True if you want to skip all the rendering of previews in this notebook and just get an update of GeoJSON/TS/JSON files.
python
# Directory the Natural Earth archives are downloaded to and later read from.
data_dir = os.path.expanduser("~/Downloads")
# makedirs(exist_ok=True) also creates missing parent directories and does not
# fail if the directory appears between an existence check and its creation.
os.makedirs(data_dir, exist_ok=True)

def download_files(skip_existing: bool):
    """Download the Natural Earth shapefile archives into ``data_dir``.

    Args:
        skip_existing: When true, an archive is not downloaded again if a
            local file with the same name already exists and its size equals
            the ``content-length`` reported by the server.

    On the first non-200 response an error message is printed and the
    remaining archives are not attempted; "Done." is only printed when the
    loop ran to completion.
    """
    for url in [
        "https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_0_countries.zip",
        "https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/10m/cultural/ne_10m_admin_1_states_provinces.zip",
        "https://www.naturalearthdata.com/http//www.naturalearthdata.com/download/50m/cultural/ne_50m_admin_1_states_provinces.zip"
    ]:
        file_name = url.split('/')[-1]
        full_file_name = f'{data_dir}/{file_name}'
        # temporary fix
        url = url.replace("https://www.naturalearthdata.com/http//www.naturalearthdata.com/download", "https://naciscdn.org/naturalearth")
        with requests.get(
            url,
            headers={
                "accept-encoding": "gzip, deflate, br",
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36"
            },
            stream=True,
        ) as res:
            # Check the status before touching the headers: an error response
            # is not guaranteed to carry a content-length.
            if res.status_code != 200:
                print("Error downloading files. Please open the URL to download them from browser manually.")
                break
            content_length = res.headers.get('content-length')
            file_size = int(content_length) if content_length is not None else None
            if (
                skip_existing and
                file_size is not None and
                os.path.exists(full_file_name) and
                file_size == os.path.getsize(full_file_name)
            ):
                print(f"Skip {file_name} because it already exists")
                continue
            print(f"Downloading {file_name}... \r", end="")
            with open(full_file_name, "wb") as fh:
                # The request is streamed, so write it chunk by chunk instead
                # of loading the whole archive into memory via res.content.
                for chunk in res.iter_content(chunk_size=1024 * 1024):
                    fh.write(chunk)
    else:
        # Only reached when the loop was not aborted by a failed download.
        print("Done.                                                            ")

download_files(skip_existing=False)
python
# Read Natural Earth data files into GeoDataFrames
df_admin0_10m = gpd.read_file(f"{data_dir}/ne_10m_admin_0_countries.zip")
df_10m = gpd.read_file(f"{data_dir}/ne_10m_admin_1_states_provinces.zip")
df_50m = gpd.read_file(f"{data_dir}/ne_50m_admin_1_states_provinces.zip")

# Convert column names to lowercase
df_admin0_10m.columns = df_admin0_10m.columns.str.lower()
python
df_50m.groupby('admin').count()
python
# Use 1:50m geometry for some large countries:

print(*df_50m['admin'].unique(), sep='\n')
python
df = pd.concat([df_10m[~df_10m['admin'].isin(df_50m['admin'].unique())], df_50m])
rdf = df[(df['admin'] == 'Saint Pierre and Miquelon')]
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
rdf

Adjust the Maps

<span style="color: red; font-size: 1.5em">TO SUPPORT NEW COUNTRIES, ADD COUNTRY NAME BELOW</span>

python
# Country names used in file names
countries = [
  'afghanistan',
  'aland',
  'albania',
  'algeria',
  'american samoa',
  'andorra',
  'angola',
  'anguilla',    
  'antarctica',
  'antigua and barbuda',
  'argentina',
  'armenia',
  'aruba',
  'australia',
  'austria',
  'azerbaijan',
  'the bahamas',
  'bahrain',
  'bangladesh',
  'barbados',
  'belarus',
  'belgium',
  'belize',
  'benin',
  'bermuda',
  'bhutan',
  'bolivia',
  # 'bonaire, sint eustatius and saba', # Part of Netherlands Antilles, part of Netherlands, according to Natural Earth?
  'bosnia and herzegovina',
  'botswana',
  # 'bouvet island', # part of Norway, in Natural Earth data
  'brazil',
  'brunei',
  'british indian ocean territory',
  'bulgaria',
  'burkina faso',
  'burundi',
  'cape verde',
  'cambodia',
  'cameroon',
  'canada',
  'cayman islands',
  'central african republic',
  'chad',
  'chile',
  'china',
  # 'christmas island', # part of British Indian Ocean Territory, according to Natural Earth
  # 'cocos (keeling) islands', # part of British Indian Ocean Territory, according to Natural Earth
  'colombia',
  'comoros',
  'democratic republic of the congo',
  'cook islands',
  'costa rica',
  'croatia',
  'cuba',
  'curaçao',
  'cyprus',
  'czech republic',
  'denmark',
  'djibouti',
  'dominica',
  'dominican republic',
  'ecuador',
  'egypt',
  'el salvador',
  'equatorial guinea',
  'eritrea',
  'estonia',
  # 'eswatini', # not sure why this doesn't work — Swaziland isn't available to alias, either.
  'ethiopia',
  'falkland islands',
  'faroe islands',
  'fiji',
  'finland',
  'france',
  # 'french guiana',
  'french polynesia',
  #'french southern and antarctic lands',
  'gabon', 
  'gambia',
  'germany',
  'ghana',
  'gibraltar',
  'greece',
  'greenland',
  'grenada',
  # 'guadeloupe', # part of France, in Natural Earth data
  'guam',
  'guatemala',
  'haiti',
  'hungary',
  'guernsey',
  'guinea',
  # 'guinea-bissau', # not sure why this isn't working
  'guyana',
  'honduras',
  'iceland',
  'india',
  'indonesia',
  'iran',
  'israel',
  'italy',
  'ivory coast',
  'japan',
  'jordan',
  'kazakhstan',
  'kenya',
  'korea',
  'kuwait',
  'kyrgyzstan',
  'laos',
  'latvia',
  'lebanon',
  'lesotho',
  'liberia',
  'libya',
  'liechtenstein',
  'lithuania',
  'luxembourg',
  # 'macao', # part of China, in Natural Earth data
  'macedonia',
  'madagascar',
  'malawi',
  'malaysia',
  'maldives',
  'mali',
  'malta',
  'marshall islands',
  # 'martinique', # part of France, in Natural Earth data
  'mauritania',
  'mauritius',
  # 'mayotte', # part of France, in Natural Earth data
  'mexico',
  'moldova',
  'montserrat',
  'monaco',
  'mongolia',
  'montenegro',
  'morocco',
  'mozambique',
  'myanmar',
  'namibia',
  'nauru',
  'nepal',
  'netherlands',
  'new caledonia',
  'new zealand',
  'nicaragua',
  'niger',
  'nigeria',
  'niue',
  'norfolk island',
  'northern mariana islands',
  'norway',
  'oman',
  'pakistan',
  'palau',
  # 'palestine', # part of Israel, in Natural Earth data
  'panama',
  'papua new guinea',
  'paraguay',
  'peru',
  # 'pitcairn', # part of UK, in Natural Earth data
  'philippines',
  'poland',
  'portugal',
  'puerto rico',
  'qatar',
  # 'réunion', # part of France, in Natural Earth data
  'republic of serbia',
  'romania',
  'russia',
  'rwanda',
  'saint barthelemy',
  # 'saint helena, ascension and tristan da cunha', # part of UK, in Natural Earth data
  'saint lucia',
  'saint martin',
  #'saint martin (french part)', part of Saint Martin, in Natural Earth data
  'saint pierre and miquelon',
  'saint vincent and the grenadines',
  'samoa',
  'san marino',
  'sao tome and principe',
  'saudi arabia',
  'senegal',
  'seychelles',
  'sierra leone',
  'singapore',
  # 'sint maarten (dutch part)', part of Saint Martin, in Natural Earth data
  'slovakia',
  'slovenia',
  'solomon islands',
  'somalia',
  'south africa',
  # 'south sudan', # part of Sudan, in Natural Earth data
  'spain',
  'sri lanka',
  'sudan',
  'suriname',
  # 'svalbard and jan mayen', # part of Norway, in Natural Earth data
  'sweden',
  'switzerland',
  'syria',
  'taiwan',
  'tajikistan',
  'tanzania',
  'thailand',
  'timorleste',
  'togo',
  # 'tokelau', # part of New Zealand, in Natural Earth data
  'tonga',
  'trinidad and tobago',
  'tunisia',
  'turkey',
  'turkmenistan',
  'turks and caicos islands',
  'tuvalu',
  'uganda',
  'uk',
  'ukraine',
  'united arab emirates',
  'united states minor outlying islands',
  'uruguay',
  'usa',
  'uzbekistan',
  'vanuatu',
  'vatican',
  'venezuela',
  'vietnam',
  'british virgin islands',
  'united states virgin islands',
  'wallis and futuna',
  'yemen',
  'zambia',
  'zimbabwe'
]

# country name used in dataset
country_name_aliases = {
    "korea": "south korea",
    "uk": "united kingdom",
    "usa": "united states of america",
    "timorleste": "east timor",
    "tanzania": "united republic of tanzania",
}

# CSV files that are defined later in the notebook:
region_maps = [
  'france_regions',
  'france_overseas',
  'italy_regions',
  'philippines_regions',
  'turkey_regions'
]

# Make sure all country names are covered:
# The set of admin names in the dataset is built once, rather than being
# recomputed for every entry of `countries`.
known_admin_names = set(df["admin"].str.lower().unique())
invalid_countries = [
    x for x in countries
    if country_name_aliases.get(x, x) not in known_admin_names and x not in region_maps
]

if invalid_countries:
    print(f"Following country names are not valid: {invalid_countries}")

Preview all countries:

python
# Plotting style defaults:
plot_styles = {
    'edgecolor': 'black',  # Sets the color of the border of each geometry in the plot to black
    'column': 'name',      # Specifies the column to be used for coloring the geometries based on its values
    'legend': False,       # Disables the legend for the plot
    'cmap': 'tab20',       # Sets the colormap to 'tab20' which provides a distinct set of colors for visual differentiation
    'linewidth': 0.25       # Sets the thickness of the edges/borders of the geometries
}
python
def get_gdf(country):
    """Return a copy of the GeoDataFrame to use for ``country``.

    A frame stored under ``country`` in ``alt_maps`` takes precedence;
    otherwise the rows of ``df`` whose lower-cased ``admin`` value equals the
    alias-resolved country name are selected.
    """
    dataset_name = country_name_aliases.get(country, country)
    use_override = alt_maps and country in alt_maps
    selected = alt_maps[country] if use_override else df[df["admin"].str.lower() == dataset_name]
    return selected.copy()

def plot_all_countries(countries, subplot_width=5, subplot_height=5, base_tolerance=0.01):
    """Draw one preview subplot per country on a fixed six-column grid.

    Args:
        countries: Country names; each one is resolved to a GeoDataFrame via
            ``get_gdf``. If empty, a message is printed and nothing is drawn.
        subplot_width: Width of one subplot; multiplied by the column count
            to get the figure width.
        subplot_height: Height of one subplot; multiplied by the row count
            to get the figure height.
        base_tolerance: Base factor of the area-scaled simplification
            tolerance computed for each country.
    """
    if not countries:
        print("No countries to plot.")
        return

    num_countries = len(countries)
    ncols = 6  # Set fixed number of columns
    nrows = max((num_countries + ncols - 1) // ncols, 1)  # Ensure at least one row

    figsize = (ncols * subplot_width, nrows * subplot_height)
    print(f"Debug Info: figsize={figsize}, ncols={ncols}, nrows={nrows}, num_countries={num_countries}")

    plt.figure(figsize=figsize)
    plt.rc('font', size=24)  # Sets the font size globally

    for i, country in enumerate(countries):
        ax = plt.subplot(nrows, ncols, i + 1)
        gdf = get_gdf(country)
        if not gdf.empty:
            # Reproject to EPSG:6933 and derive a tolerance that grows with the
            # summed area (presumably square metres in this CRS — TODO confirm).
            gdf_projected = gdf.to_crs(epsg=6933)
            area = gdf_projected['geometry'].area.sum()
            dynamic_tolerance = base_tolerance * (area / 1e6) ** 0.8  
            gdf_projected['geometry'] = gdf_projected['geometry'].simplify(tolerance=dynamic_tolerance, preserve_topology=True)

            # NOTE(review): gdf_projected (reprojected and simplified above) is
            # not used after this point; the plot is drawn from the unsimplified
            # gdf. Confirm whether the simplified geometry was meant to be plotted.
            gdf.plot(ax=ax, **plot_styles)
            ax.set_aspect('equal', adjustable='datalim')
        else:
            # No rows for this country: show its name as a placeholder.
            ax.text(0.5, 0.5, country, ha='center', va='center', fontsize=24)
        ax.set_title(country)

    plt.tight_layout()
    plt.show()
    # Undo the global font-size change made with plt.rc above.
    plt.rcdefaults()


# If you want to see a preview of all countries before they're touched up, just switch this to True:
# It's disabled because it takes a while to run.
plot_preview = False
if plot_preview:
    # plot_all_countries requires the list of countries to draw; calling it
    # without arguments raises a TypeError.
    plot_all_countries(countries)

Handle countries with flying islands

  • For countries with flying islands, we need to move the islands closer to the mainland.
  • TODO: There are several countries that could use some design touchup, including
    • American Samoa
    • Cook Islands
    • Fiji
    • French Polynesia
    • Maldives
    • Marshall Islands
    • Mauritius
    • New Zealand
    • Palau
    • Portugal
    • Seychelles
    • Tonga
    • United States Minor Outlying Islands

USA

python
usa = df[df['adm0_a3'] == 'USA']
not speed_run and usa.plot(**plot_styles)
python
def reposition(df, idx, xoff=None, yoff=None, xscale=None, yscale=None, simplify=None):
    """Translate, scale and/or simplify the selected geometries of ``df`` in place.

    Args:
        df: Frame with a ``geometry`` column; it is modified in place.
        idx: Row selector passed to ``df.loc`` (e.g. a boolean mask).
        xoff, yoff: Translation offsets; a missing one is treated as 0. The
            translation is skipped when both are falsy.
        xscale, yscale: Scale factors; a missing one is treated as 1. Scaling
            is skipped when both are falsy.
        simplify: Tolerance passed to the geometry's ``simplify`` (called with
            ``preserve_topology=False``); skipped when falsy.
    """

    def transform(geom):
        # Each step is applied only when requested; order is translate,
        # then scale, then simplify.
        moved = shapely.affinity.translate(geom, xoff or 0, yoff or 0) if (xoff or yoff) else geom
        scaled = shapely.affinity.scale(moved, xscale or 1, yscale or 1) if (xscale or yscale) else moved
        return scaled.simplify(simplify, preserve_topology=False) if simplify else scaled

    selected = df.loc[idx, 'geometry']
    df.loc[idx, 'geometry'] = selected.apply(transform)


usa_copy = usa.copy()
reposition(usa_copy, usa.name == 'Hawaii', 51, 5.5)
reposition(usa_copy, usa.name == 'Alaska', 35, -34, 0.35, 0.35)

not speed_run and usa_copy.plot(figsize=(8,8), **plot_styles)

China

China claims sovereignty over Taiwan. For disputed territories, we respect each country and give them what they want.

In addition, Hong Kong and Macau should also be included in a China map.

python
# Chinese Special Administrative Regions
china_sars = df_admin0_10m.loc[
    df_admin0_10m.name_en.isin(['Taiwan', 'Hong Kong', 'Macau']),
    [x for x in df_admin0_10m.columns if x in df.columns]
]
china_sars = china_sars.merge(pd.DataFrame(
    data={
        "name_en": ["Taiwan", "Hong Kong", "Macau"],
        "name_zh": ["中国台湾", "香港特别行政区", "澳门特别行政区"],
        "iso_3166_2": ["CN-71", "CN-91", "CN-92"],
    },
), on="name_en", how="left")
china_sars
python
china = df[df.admin == "China"]
china_copy = pd.concat([china, china_sars], ignore_index=True)

# Combine the 'name_zh' columns
china_copy["name_zh"] = china_copy["name_zh"].combine_first(china_copy["name_zh_y"])

# Drop the extra 'name_zh_x' and 'name_zh_y' columns, if they exist
china_copy = china_copy.drop(["name_zh_x", "name_zh_y"], axis=1)

# Plotting the DataFrame
not speed_run and china_copy.plot(**plot_styles)

Note ISO-3166-2:CN has updated subdivisions to use letters instead of numbers (e.g. CN-91 -> CN-HK). We kept the numeric code for backward compatibility.

Finland

  • The Åland Islands (ISO country code AX) is an autonomous region of Finland, and carries the ISO-3166 code FI-01.
python
finland_aland = df_admin0_10m.loc[
    df_admin0_10m.name_en.isin(['Åland']),
    [x for x in df_admin0_10m.columns if x in df.columns]
]
finland_aland = finland_aland.merge(pd.DataFrame(
    data={
        "name_en": ["Åland"],
        "name_fi": ["Ahvenanmaan maakunta"],
        "iso_3166_2": ["FI-01"],
    },
), on="name_en", how="left")

python
finland = df[df.admin == "Finland"]

# Concatenate the 'finland' DataFrame with 'finland_aland' DataFrame
finland_copy = pd.concat([finland, finland_aland], ignore_index=True)

# Combine 'name_fi' columns. However, since both columns are named 'name_fi', this might be redundant
# If you have two different columns for 'name_fi' values in each DataFrame, specify them as 'name_fi_x' and 'name_fi_y'
finland_copy["name_fi"] = finland_copy["name_fi"].combine_first(finland_copy["name_fi"])

# Drop the 'name_fi' column, if that's intended. This will remove the 'name_fi' data entirely.
# If you meant to drop other columns (like 'name_fi_x' and 'name_fi_y'), update the column names accordingly
finland_copy = finland_copy.drop(["name_fi"], axis=1)

# Plotting the DataFrame
not speed_run and finland_copy.plot(figsize=(7, 7), **plot_styles)

Ukraine

  • Crimea, carrying ISO-3166 code UA-43, belongs to Ukraine (ISO country code UA)
python

russia_copy = df[df['adm0_a3'] == 'RUS'].copy()
crimea = russia_copy[russia_copy['iso_3166_2'] == 'UA-43'].copy()
sevastopol = russia_copy[russia_copy['iso_3166_2'] == 'UA-40'].copy()

ukraine_with_crimea = pd.concat([df[df['adm0_a3'] == 'UKR'], crimea, sevastopol], ignore_index=True)

# kyiv = ukraine_with_crimea[ukraine_with_crimea['iso_3166_2'] == 'UA-30']
# kyiv_oblast = ukraine_with_crimea[ukraine_with_crimea['iso_3166_2'] == 'UA-32']

# Update the name of the Kyiv city entry
ukraine_with_crimea.loc[ukraine_with_crimea['iso_3166_2'] == 'UA-30', 'name'] = 'Kyiv'

# # Update the name of the Kyiv Oblast entry
ukraine_with_crimea.loc[ukraine_with_crimea['iso_3166_2'] == 'UA-32', 'name'] = 'Kyiv Oblast'

# Plotting the DataFrame
not speed_run and ukraine_with_crimea.plot(figsize=(7,7), **plot_styles)

India

  • Jammu and Kashmir and Ladakh with ISO-3166 codes IN-JK and IN-LA respectively are new territories of India. They are part of the disputed region of Jammu and Kashmir.
python
# Select the rows for India and work on a copy, so the geometry replacements
# below are made on india_copy rather than on the selection from df.
india = df[df['admin'] == 'India']
india_copy = india.copy()

# Download and load the GeoJSON file for India
india_geojson_url = "https://github.com/geohacker/india/raw/bcb920c7d3c686f01d085f7661c9ba89bf9bf65e/state/india_state_kashmir_ladakh.geojson"

# Best-effort: any failure inside the try block (reading the file, renaming,
# or assigning the geometries) is reported below instead of being raised.
try:
    india_gdf = gpd.read_file(india_geojson_url)
    # Rename the 'ST_ID' column to 'iso_3166_2' for consistency
    india_gdf.rename(columns={'ST_ID': 'iso_3166_2'}, inplace=True)
    # Update the geometry for the states of Jammu and Kashmir and Ladakh:
    # the matching rows of the downloaded file are dissolved into one geometry
    # per code, which replaces the geometry of the row with the same code.
    india_copy.loc[india_copy['iso_3166_2'] == 'IN-JK', 'geometry'] = india_gdf[india_gdf['iso_3166_2'] == 'IN-JK'].dissolve(by='iso_3166_2').reset_index()["geometry"].values
    india_copy.loc[india_copy['iso_3166_2'] == 'IN-LA', 'geometry'] = india_gdf[india_gdf['iso_3166_2'] == 'IN-LA'].dissolve(by='iso_3166_2').reset_index()["geometry"].values
    print("GeoJSON file for India downloaded and loaded successfully.")
except Exception as e:

    print(f"Unable to download or load the GeoJSON file for India. Error: {str(e)}")
    print("Please download the file from the URL and try again.")
python
not speed_run and india_copy.plot(**plot_styles)

Norway

  • Remove NO-X01~ (The uninhabited Bouvet Island) and move Svalbard closer to mainland
python
norway = df[df['adm0_a3'] == 'NOR']
not speed_run and norway.plot(**plot_styles)
python
norway_copy = norway.copy()

norway_copy = norway_copy[norway_copy["iso_3166_2"] != "NO-X01~"]
reposition(norway_copy, norway.name == 'Svalbard', -12, -8, 0.5, 0.5)
#reposition(norway_copy, norway.name == 'Nordland', 10, 0, 2, 2)

not speed_run and norway_copy.plot(**plot_styles)

Portugal

python
portugal = df[df.admin == 'Portugal']
not speed_run and portugal.plot(**plot_styles)
python
portugal_copy = portugal.copy()

reposition(portugal_copy, portugal.name == 'Azores', 11, 0)
reposition(portugal_copy, portugal.name == 'Madeira', 6, 2, simplify=0.015)

not speed_run and portugal_copy.plot(figsize=(8, 8), **plot_styles)

Spain

python
spain = df[df.admin == 'Spain']
not speed_run and spain.plot(**plot_styles)
python
spain_copy = spain.copy()

reposition(spain_copy, spain.name.isin(['Las Palmas', 'Santa Cruz de Tenerife']), 3, 7, 1, 1)

not speed_run and spain_copy.plot(figsize=(8, 8), **plot_styles)

Russia

python
russia = df[df.admin == 'Russia']
not speed_run and russia.plot(**plot_styles)
  • Russia looks off because Chukchi runs across the 180° meridian (E180). We need to move the parts on the other side of the map to the right.
python
def shift_geom(geom, cutoff=0):
    """Move the parts of ``geom`` that start west of ``cutoff`` to the east.

    The geometry is split along the vertical line ``x = cutoff``. Every piece
    whose minimum x is below ``cutoff`` is translated by ``360 - cutoff`` along
    x; the other pieces are kept as they are. The pieces are then merged back
    into a single geometry with ``unary_union``.
    """
    meridian = shapely.geometry.LineString([(cutoff, -90), (cutoff, 90)])
    pieces = shapely.ops.split(geom, meridian)

    # Anything other than a GeometryCollection is treated as "nothing was
    # split": the input geometry is used unchanged.
    if not isinstance(pieces, shapely.geometry.GeometryCollection):
        return shapely.ops.unary_union([geom])

    relocated = [
        shapely.affinity.translate(part, xoff=360 - cutoff) if part.bounds[0] < cutoff else part
        for part in pieces.geoms
    ]
    return shapely.ops.unary_union(relocated)

# Applying the function to the DataFrame
russia_copy = russia.copy()
russia_copy.loc[
    russia.name == 'Chukchi Autonomous Okrug', 'geometry'
] = russia_copy.loc[
    russia.name == 'Chukchi Autonomous Okrug', 'geometry'
].apply(shift_geom)

# Plotting
not speed_run and russia_copy.plot(figsize=(20, 20), **plot_styles)

Turkey

Turkey Regions

python
turkey = df[df.admin == 'Turkey'][['iso_3166_2','geometry']]
not speed_run and turkey.plot(**{key: value for key, value in plot_styles.items() if key != 'column'})
python
# NUTS-1 codes for Turkey and the corresponding region and city names

region_dict = {
 'TR1': ['TR-34'],
 'TR2': ['TR-59', 'TR-22', 'TR-39', 'TR-10', 'TR-17'],
 'TR3': ['TR-35', 'TR-09', 'TR-20', 'TR-48', 'TR-45', 'TR-03', 'TR-43', 'TR-64'],
 'TR4': ['TR-16', 'TR-26', 'TR-11', 'TR-41', 'TR-54', 'TR-81', 'TR-14', 'TR-77'],
 'TR5': ['TR-06', 'TR-42', 'TR-70'],
 'TR6': ['TR-07', 'TR-32', 'TR-15', 'TR-01', 'TR-33', 'TR-31', 'TR-46', 'TR-80'],
 'TR7': ['TR-71', 'TR-68', 'TR-51', 'TR-50', 'TR-40', 'TR-38', 'TR-58', 'TR-66'],
 'TR8': ['TR-67', 'TR-78', 'TR-74', 'TR-37', 'TR-18', 'TR-57', 'TR-55', 'TR-60', 'TR-19', 'TR-05'],
 'TR9': ['TR-61', 'TR-52', 'TR-28', 'TR-53', 'TR-08', 'TR-29'],
 'TRA': ['TR-25', 'TR-24', 'TR-69', 'TR-04', 'TR-36', 'TR-76', 'TR-75'],
 'TRB': ['TR-44', 'TR-23', 'TR-12', 'TR-62', 'TR-65', 'TR-49', 'TR-13', 'TR-30'],
 'TRC': ['TR-27', 'TR-02', 'TR-79', 'TR-63', 'TR-21', 'TR-47', 'TR-72', 'TR-73', 'TR-56']}

# Region names corresponding to NUTS-1

region_name_dict = {'TR1':'İstanbul',
                    'TR2':'Batı Marmara',
                    'TR3':'Ege',
                    'TR4':'Doğu Marmara',
                    'TR5':'Batı Anadolu',
                    'TR6':'Akdeniz',
                    'TR7':'Orta Anadolu',
                    'TR8':'Batı Karadeniz',
                    'TR9':'Doğu Karadeniz',
                    'TRA':'Kuzeydoğu Anadolu',
                    'TRC':'Güneydoğu Anadolu',
                    'TRB':'Ortadoğu Anadolu'
                    }


def create_region_polygons(region_dict, turkey_gdf):
    """Merge city polygons into one polygon per region.

    Args:
        region_dict: Mapping of region code to the list of ``iso_3166_2``
            city codes that belong to that region.
        turkey_gdf: GeoDataFrame with an ``iso_3166_2`` column. It is not
            modified; the region column is added to a copy.

    Returns:
        The dissolved GeoDataFrame with the region code in a ``REGION``
        column and the ``iso_3166_2`` column removed.
    """
    # Reverse lookup: city code -> region code
    city_to_region = {
        city_code: region_code
        for region_code, city_codes in region_dict.items()
        for city_code in city_codes
    }

    # Work on a copy so the caller's frame (typically a slice of a larger
    # frame) does not silently gain a 'REGION' column.
    tagged = turkey_gdf.copy()
    tagged['REGION'] = tagged['iso_3166_2'].map(city_to_region)

    # Combine the city polygons of each region and turn the 'REGION' index
    # back into a regular column.
    region_gdf = tagged.dissolve(by='REGION').reset_index()

    return region_gdf.drop(columns=['iso_3166_2'])
python
# Dissolve the Turkish city polygons into one polygon per region
turkey_regions = create_region_polygons(region_dict, turkey)

# Rename the 'REGION' column to 'iso_3166_2'
turkey_regions = turkey_regions.rename(columns={'REGION': 'iso_3166_2'})

# Map each region code to its name (region_name_dict) in a new 'name' column
turkey_regions['name'] = turkey_regions['iso_3166_2'].map(region_name_dict)
python
not speed_run and turkey_regions.plot(figsize=(10, 7), **plot_styles)

France

python
france = df[df.admin == 'France']
not speed_run and france.plot(**plot_styles)

Move the Overseas departments and regions of France closer to mainland.

Fix some department names and region codes

  • Seien-et-Marne => Seine-et-Marne
  • Haute-Rhin => Haut-Rhin
  • FR-IDF\t => FR-IDF
python
def replace_column(column, df, old, new):
    """Replace every ``old`` value in ``df[column]`` with ``new``, in place.

    Nothing is written when ``old`` does not occur in the column.
    """
    matches = df[column] == old
    if matches.any():
        df.loc[matches, column] = new
        
replace_column('name', france, 'Seien-et-Marne', 'Seine-et-Marne')
replace_column('name', france, 'Haute-Rhin', 'Haut-Rhin')
replace_column('region_cod', france, 'FR-IDF\t', 'FR-IDF')
python
france_copy = france.copy()
reposition(france_copy, france.name=='Guadeloupe', 57.4, 25.4, 1.5, 1.5)
reposition(france_copy, france.name=='Martinique', 58.4, 27.1, 1.5, 1.5)
reposition(france_copy, france.name=='Guyane française', 52, 37.7, 0.35, 0.35)
reposition(france_copy, france.name=='La Réunion', -55, 62.8, 1.5, 1.5)
reposition(france_copy, france.name=='Mayotte', -43, 54.3, 1.5, 1.5)

not speed_run and france_copy.plot(figsize=(8, 8), **plot_styles)

France Regions

python
france_regions = france_copy[['geometry','region_cod','region']]
python
france_regions = france_regions.dissolve(by=['region_cod', 'region']).reset_index()

france_regions = france_regions.rename(columns={'region': 'name', 'region_cod': 'iso_3166_2'})
python
not speed_run and france_regions.plot(figsize=(10, 7), **plot_styles)

France with Overseas

This step creates a map of France with Overseas, in a friendly layout to see all territories and make them easy to see and interact with.

python
france_overseas = france.copy()
reposition(france_overseas, france.name=='Guadeloupe', 53.2, 29, 1.5, 1.5)
reposition(france_overseas, france.name=='Martinique', 52.8, 27.5, 1.5, 1.5)
reposition(france_overseas, france.name=='Guyane française', 45, 35.5, 0.3, 0.3)
reposition(france_overseas, france.name=='La Réunion', -58.2, 60.5, 1.5, 1.5)
reposition(france_overseas, france.name=='Mayotte', -50.5, 52.2, 2, 2)

# Tahiti
# Take an explicit copy: the geometry is overwritten below, and writing into a
# boolean-filtered selection of df is a chained assignment.
tahiti_data = df[(df['admin'] == 'French Polynesia') & (df['name'] == 'Windward Islands')].copy()
# Remove Rimatuu to avoid confusion with Corsica when displayed on the map
windward_geom = tahiti_data.iloc[0].geometry
# Keep every polygon of the multi-polygon except the one at position 1
filtered_geom = MultiPolygon([geom for i, geom in enumerate(windward_geom.geoms) if i != 1])
# Update the geometry in the tahiti_data DataFrame
tahiti_data.at[tahiti_data.index[0], 'geometry'] = filtered_geom
france_overseas = pd.concat([france_overseas, tahiti_data], ignore_index=True)
reposition(france_overseas, france_overseas.name=='Windward Islands', 158.2, 57.3, 2, 2)

# Kerguelen
kerguelen_data = df[(df['admin'] == 'French Southern and Antarctic Lands') & (df['name'] == 'Archipel des Kerguelen')]
france_overseas = pd.concat([france_overseas, kerguelen_data], ignore_index=True)
reposition(france_overseas, france_overseas.name=='Archipel des Kerguelen', -63.5, 88.5, 0.9, 0.9)

# Wallis and Futuna
# Take an explicit copy: reposition() assigns new geometries into the frame it
# is given, and this frame would otherwise be a filtered selection of df.
wallis_futuna_data = df[(df['admin'] == 'Wallis and Futuna') & (df['name'].isin(['Alo', '`Uvea']))].copy()
reposition(wallis_futuna_data, wallis_futuna_data.name=='Alo', 11.3, 1.1)
reposition(wallis_futuna_data, wallis_futuna_data.name=='`Uvea', 9.5, 0.2)
# Merge the two repositioned rows into a single row for the territory
wallis_futuna_merged = wallis_futuna_data.dissolve(by='admin').reset_index()
france_overseas = pd.concat([france_overseas, wallis_futuna_merged], ignore_index=True)
reposition(france_overseas, france_overseas.admin=='Wallis and Futuna', 170, 52.5, 4, 4)

# New Caledonia
new_caledonia_data = df[(df['admin'] == 'New Caledonia')]
new_caledonia_merged = new_caledonia_data.dissolve(by='admin').reset_index()
france_overseas = pd.concat([france_overseas, new_caledonia_merged], ignore_index=True)
reposition(france_overseas, france_overseas.admin=='New Caledonia', -165.5, 60.4, 0.4, 0.4)

# Saint Pierre and Miquelon
saint_pierre_and_miquelon_data = df[((df['admin'] == 'Saint Pierre and Miquelon'))]
saint_pierre_and_miquelon_merged = saint_pierre_and_miquelon_data.dissolve(by='admin').reset_index()
france_overseas = pd.concat([france_overseas, saint_pierre_and_miquelon_merged], ignore_index=True)
reposition(france_overseas, france_overseas.admin=='Saint Pierre and Miquelon', 48, 4, 3, 3)

# Saint Martin
saint_martin_data = df[(df['admin'] == 'Saint Martin')]
france_overseas = pd.concat([france_overseas, saint_martin_data], ignore_index=True)
reposition(france_overseas, france_overseas.admin=='Saint Martin', 54.8, 30.3, 5, 5)

# Saint Barthélémy
saint_barthelemy_data = df[(df['admin'] == 'Saint Barthelemy')]
france_overseas = pd.concat([france_overseas, saint_barthelemy_data], ignore_index=True)
reposition(france_overseas, france_overseas.admin=='Saint Barthelemy', 54.5, 30, 8, 8)

# Reposition Paris, and Departements 92 93 94 so that we can actually see them
paris_and_littlecrowndpts = france_overseas[france_overseas['name'].isin(['Paris', 'Hauts-de-Seine', 'Seine-Saint-Denis', 'Val-de-Marne'])]
grouped_geometry = MultiPolygon(paris_and_littlecrowndpts['geometry'].tolist())
grouped_geometry_transformed = shapely.affinity.scale(shapely.affinity.translate(grouped_geometry, xoff=6.3, yoff=2.3), xfact=3, yfact=3)
transformed_geometries = list(grouped_geometry_transformed.geoms)
paris_and_littlecrowndpts_copy = paris_and_littlecrowndpts.copy()
paris_and_littlecrowndpts_copy['geometry'] = transformed_geometries
france_overseas = france_overseas[~france_overseas['name'].isin(['Paris', 'Hauts-de-Seine', 'Seine-Saint-Denis', 'Val-de-Marne'])]
france_overseas = pd.concat([france_overseas, paris_and_littlecrowndpts_copy], ignore_index=True)

# Update metadata properly
france_overseas.loc[france_overseas['name'] == 'Windward Islands', ['name', 'iso_3166_2']] = ['Polynésie française', 'FR-PF']
france_overseas.loc[france_overseas['name'] == 'Archipel des Kerguelen', ['name', 'iso_3166_2']] = ['Terres australes et antarctiques françaises', 'FR-TF']
france_overseas.loc[france_overseas['admin'] == 'Wallis and Futuna', ['name', 'iso_3166_2']] = ['Wallis et Futuna', 'FR-WF']
france_overseas.loc[france_overseas['admin'] == 'New Caledonia', ['name', 'iso_3166_2']] = ['Nouvelle-Calédonie', 'FR-NC']
france_overseas.loc[france_overseas['admin'] == 'Saint Pierre and Miquelon', ['name', 'iso_3166_2']] = ['Saint-Pierre-et-Miquelon', 'FR-PM']
france_overseas.loc[france_overseas['admin'] == 'Saint Martin', ['name', 'iso_3166_2']] = ['Saint-Martin', 'FR-MF']
france_overseas.loc[france_overseas['admin'] == 'Saint Barthelemy', ['name', 'iso_3166_2']] = ['Saint-Barthélémy', 'FR-BL']

# Plot data
france_overseas = france_overseas.rename(columns={'NAME_1': 'name','ISO': 'iso_3166_2'})
not speed_run and france_overseas.plot(figsize=(15, 15), **plot_styles)

Italy

Italy Regions

python
italy_regions = df[df.admin == 'Italy'][['geometry','region_cod','region']]

italy_regions = italy_regions.dissolve(by=['region_cod', 'region']).reset_index()

italy_regions = italy_regions.rename(columns={'region': 'name', 'region_cod': 'iso_3166_2'})
python
not speed_run and italy_regions.plot(figsize=(10, 7), **plot_styles)

Netherlands

python
def apply_bounds(df, northwest, southeast):
    """Return the rows of ``df`` whose geometry lies inside a rectangle.

    Args:
        df: frame with a ``geometry`` column/attribute of shapely geometries.
        northwest: ``(x, y)`` of the rectangle's north-west corner.
        southeast: ``(x, y)`` of the rectangle's south-east corner.

    Returns:
        A filtered frame holding only the rows for which the rectangle
        ``contains`` the row's geometry.
    """
    west, north = northwest
    east, south = southeast
    corners = [(west, north), (west, south), (east, south), (east, north)]
    boundary = shapely.geometry.Polygon(corners)
    candidates = df.copy()
    inside = candidates.geometry.apply(boundary.contains)
    return candidates[inside]
python
# All rows of df whose admin is 'Netherlands'.
netherlands = df[df.admin == 'Netherlands']
# Preview of the unfiltered selection; skipped when speed_run is True.
not speed_run and netherlands.plot(**plot_styles)
python
# Keep only the rows whose geometry lies inside the rectangle with north-west
# corner (-20, 60) and south-east corner (20, 20).
# NOTE(review): presumably this drops territories far from the European mainland - confirm.
netherlands_copy = apply_bounds(netherlands, (-20, 60), (20, 20))
# Preview of the filtered selection; skipped when speed_run is True.
not speed_run and netherlands_copy.plot(figsize=(8, 8), **plot_styles)

Latvia

The administrative divisions in the Natural Earth dataset have been outdated since the 2021 administrative reform; see https://en.wikipedia.org/wiki/Administrative_divisions_of_Latvia.

python
latvia_geojson_url = "https://raw.githubusercontent.com/eriks47/latvia/main/latvia.geojson"

# Start from the rows already in df; they remain in use if the download
# below fails.
latvia_copy = df[df.admin == 'Latvia'].copy()

try:
    latvia_gdf = gpd.read_file(latvia_geojson_url)
    latvia_copy = gpd.GeoDataFrame(latvia_gdf, geometry='geometry', crs=latvia_gdf.crs)
except Exception as err:
    # Best effort: report the problem and carry on with the rows from df.
    print(f"Unable to download or load the GeoJSON file for Latvia. Error: {err}")
    print("Please download the file from the URL and try again.")
else:
    print("GeoJSON file for Latvia downloaded and loaded successfully.")

# Preview; skipped when speed_run is True.
not speed_run and latvia_copy.plot(**plot_styles)

UK

python
# All rows of df whose admin is 'United Kingdom'.
uk = df[df.admin == 'United Kingdom']
# Preview of the unfiltered selection; skipped when speed_run is True.
not speed_run and uk.plot(**plot_styles)
python
# Keep only the rows whose geometry lies inside the rectangle with north-west
# corner (-10, 60) and south-east corner (20, 20).
# NOTE(review): presumably this drops remote territories and islands - confirm.
uk_copy = apply_bounds(uk, (-10, 60), (20, 20))
# Preview of the filtered selection; skipped when speed_run is True.
not speed_run and uk_copy.plot(figsize=(8, 8), **plot_styles)

Philippines

  • Merges highly urbanized cities (HUCs) into their respective geographic provinces
  • Updates provinces:
    • Forms Maguindanao del Norte and Maguindanao del Sur from the former Maguindanao province
    • Splits off Davao Occidental from Davao del Sur
    • Renames Compostela Valley to Davao de Oro
python
# The Philippines map is read from an external GeoJSON file rather than taken from df.
ph_url = "https://raw.githubusercontent.com/jdruii/phgeojson/main/philippines.geojson"
# No fallback here: a failed download raises and stops the cell.
philippines_copy = gpd.read_file(ph_url)
python
# Expose the file's NAME_1 / ISO columns under the 'name' / 'iso_3166_2' headings.
philippines_copy = philippines_copy.rename(columns={'NAME_1': 'name','ISO': 'iso_3166_2'})
# Preview; skipped when speed_run is True.
not speed_run and philippines_copy.plot(**plot_styles)

Philippines Regions

  • Adds regional map
    • Fixes outdated names
python
# Rows of df whose admin is 'Philippines', reduced to the geometry and the two region columns.
philippines_regions = df[df.admin == 'Philippines'][['geometry','region_cod','region']]
python
# Merge the geometries that share a region code and region name, then expose
# those two columns under the 'iso_3166_2' / 'name' headings.
philippines_regions = (
    philippines_regions
    .dissolve(by=['region_cod', 'region'])
    .reset_index()
    .rename(columns={'region': 'name', 'region_cod': 'iso_3166_2'})
)
python
# Region names as found in the dataset -> the names to publish instead.
philippines_region_renames = {
    'Dinagat Islands (Region XIII)': 'Caraga Administrative Region (Region XIII)',
    'Autonomous Region in Muslim Mindanao (ARMM)': 'Bangsamoro Autonomous Region in Muslim Mindanao (BARMM)',
}
philippines_regions['name'] = philippines_regions['name'].replace(philippines_region_renames)
python
# Preview the dissolved regions; skipped when speed_run is True.
not speed_run and philippines_regions.plot(figsize = (10, 7), **plot_styles)

Vietnam

  • Updates province and city names
python
# Copy the Vietnamese rows before editing their names.
vietnam = df[df.admin == 'Vietnam']
vietnam_copy = vietnam.copy()

# (name as found in the dataset, name to publish), applied in this order.
vietnam_name_fixes = [
    ('Ðong Tháp', 'Đồng Tháp'),
    ('Son La', 'Sơn La'),
    ('Ha Tinh', 'Hà Tĩnh'),
    ('Quàng Nam', 'Quảng Nam'),
    ('Lai Chau', 'Lai Châu'),
    ('Hồ Chí Minh city', 'Thành phố Hồ Chí Minh'),
    ('Hau Giang', 'Hậu Giang'),
    ('Ha Noi', 'Hà Nội'),
    ('Can Tho', 'Cần Thơ'),
    ('Đông Nam Bộ', 'Đồng Nai'),
    ('Đông Bắc', 'Bắc Kạn'),
    ('Đồng Bằng Sông Hồng', 'Hưng Yên'),
]
for old_name, new_name in vietnam_name_fixes:
    replace_column('name', vietnam_copy, old_name, new_name)

# Print every resulting name so the list can be checked by eye.
for province_name in vietnam_copy['name']:
    print(province_name)

Output GeoJSON Files

python
# Gather up all the tweaked maps!
# Maps each output name to the hand-adjusted frame built earlier in this
# notebook. The keys are later passed to save_geojson() and also become the
# entries of the generated countries.ts / countries.json.
alt_maps = {
    "finland": finland_copy,
    "china": china_copy,
    "usa": usa_copy,
    "france": france_copy,
    "france_regions": france_regions,
    "france_overseas": france_overseas,
    "turkey_regions": turkey_regions,
    "italy_regions": italy_regions,
    "philippines_regions": philippines_regions,
    "latvia": latvia_copy,
    "netherlands": netherlands_copy,
    "norway": norway_copy,
    "uk": uk_copy,
    "russia": russia_copy,
    "spain": spain_copy,
    "portugal": portugal_copy,
    "ukraine": ukraine_with_crimea,
    "india": india_copy,
    "vietnam": vietnam_copy
}


python
# Filter out countries that only have one region, making them effectively useless as a choropleth

def get_num_subdivisions(country):
    """Return the number of distinct ``iso_3166_2`` values for ``country``.

    The frame is obtained through ``get_gdf(country)``. As a side effect, a
    notice is printed when exactly one distinct value is found.
    """
    codes = get_gdf(country)['iso_3166_2'].unique()
    count = len(codes)
    if count == 1:
        print(country, "has only one subdivision - removing from countries array")
    return count

# Count the subdivisions of every country exactly once: get_num_subdivisions()
# loads the country's frame and prints a notice, so calling it for both lists
# below would double the work and print each notice twice.
subdivision_counts = {country: get_num_subdivisions(country) for country in countries}

# we add the unnecessary countries to a list here, for clearing out unneeded geojson files later
countries_to_purge = [country for country in countries if subdivision_counts[country] <= 1]

# now we purge those from our main "countries" list to continue processing
countries = [country for country in countries if subdivision_counts[country] > 1]
python
# Per-country simplification tolerance; when a country has an entry here it is
# used instead of the size-based tolerance (see simplify_if_needed).
simplify_factors = {
    "uk": 0.005,
}
# The only columns written to each GeoJSON file (see save_geojson).
useful_columns = ["ISO", "NAME_1", "geometry"]

def get_simplify_factor_by_size(gdf):
    """Pick a simplification tolerance from the area of the map's bounding box.

    The bounding box is taken over the union of all geometries in ``gdf``.
    The computed size is printed (tab-terminated) as a side effect.

    Returns:
        0.03, 0.02 or 0.01 for sizes above 1000, 300 or 100 respectively,
        otherwise 0 (meaning "do not simplify").
    """
    merged = shapely.ops.unary_union(gdf["geometry"])
    west, south, east, north = merged.bounds
    size = (east - west) * (north - south)
    print("Size", round(size, 3), end="\t")

    # Thresholds are checked from largest to smallest; the first hit wins.
    for threshold, factor in ((1000, 0.03), (300, 0.02), (100, 0.01)):
        if size > threshold:
            return factor
    return 0

def simplify_if_needed(country, gdf):
    """Simplify the geometry of ``gdf`` in place, based on country size.

    Nothing is done when the country (after alias lookup) appears in the
    lower-cased ``admin`` column of ``df_50m``. Otherwise the tolerance comes
    from ``simplify_factors`` if the country has a non-zero entry there, else
    from ``get_simplify_factor_by_size``; a tolerance of 0 leaves the geometry
    untouched.
    """
    alias = country_name_aliases.get(country, country)
    names_in_50m = df_50m["admin"].str.lower().unique()
    if alias in names_in_50m:
        return

    factor = simplify_factors.get(country)
    if not factor:
        factor = get_simplify_factor_by_size(gdf)
    if not factor:
        return

    gdf["geometry"] = gdf.simplify(factor)

def save_geojson(country):
    """Write the map for ``country`` to ``../src/countries/<country>.geojson``.

    Spaces in the country name become underscores in the file name. The frame
    returned by ``get_gdf`` gains ``ISO`` and ``NAME_1`` columns and may have
    its geometry simplified before the ``useful_columns`` are written.
    """
    gdf = get_gdf(country)
    print(country, end="\t")

    # For backward compatibility: mirror the current columns under their old names.
    for legacy_column, current_column in (("ISO", "iso_3166_2"), ("NAME_1", "name")):
        gdf[legacy_column] = gdf[current_column]

    simplify_if_needed(country, gdf)

    print(f'Saving geojson for {country}...')
    target_path = "../src/countries/" + country.replace(' ', '_') + ".geojson"
    gdf[useful_columns].to_file(target_path, driver="GeoJSON")

# Delete files left over from earlier runs for countries that were filtered out.
for country in countries_to_purge:
    stale_path = "../src/countries/{}.geojson".format(country.replace(' ', '_'))
    if not os.path.exists(stale_path):
        continue
    os.remove(stale_path)
    print(f"Purged {stale_path} since it has only one region")

# Regular countries first, then the tweaked maps.
# this overwrites some of the above... could be optimized
for country in [*countries, *alt_maps]:
    save_geojson(country)

print("Done.                          ")
python
# Preview every regular country; skipped when speed_run is True.
not speed_run and plot_all_countries(countries)
python
# Preview every tweaked map; skipped when speed_run is True.
not speed_run and plot_all_countries(alt_maps)

Output Typescript for Control Panel & JSON for Docs Site

python


def to_js_identifier(name):
    """Return ``name`` with every space and hyphen replaced by an underscore.

    Used to turn a country name into a JavaScript identifier. Only spaces and
    hyphens are handled; any other character is passed through unchanged.
    """
    return name.translate(str.maketrans({' ': '_', '-': '_'}))

# License boilerplate
# Apache License 2.0 header, written as a block comment at the very top of the
# generated countries.ts file.
license_boilerplate = """/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
"""

# Every map that gets a .geojson file: the regular countries plus the tweaked
# maps, de-duplicated and sorted so the generated file is stable between runs.
countries_combined = sorted({*countries, *alt_maps})

# Generate TypeScript import statements
# The imported path must match the file name save_geojson() writes, which only
# replaces spaces with underscores. The identifier additionally replaces
# hyphens, so deriving the path from it would point at a file that does not
# exist for any hyphenated country name.
imports = "\n".join(
    f"import {to_js_identifier(country)} from './countries/{country.replace(' ', '_')}.geojson';"
    for country in countries_combined
)

# Generate the export object
exports = (
    "export const countries = {\n  "
    + ",\n  ".join(to_js_identifier(country) for country in countries_combined)
    + ",\n};"
)

# Additional exports
# TypeScript appended verbatim to countries.ts: builds [key, label] pairs for
# the control panel. 'uk'/'usa' are upper-cased, four keys get hand-written
# labels, and every other key is split on '_' and title-cased.
# NOTE(review): 'philippines_regions' has no special case here, so its label
# becomes 'Philippines Regions', whereas the docs JSON generated at the end of
# this cell yields 'Philippines (regions)' - confirm whether that is intended.
additional_exports = """
export const countryOptions = Object.keys(countries).map(x => {
  if (x === 'uk' || x === 'usa') {
    return [x, x.toUpperCase()];
  }
  if (x === 'italy_regions') {
    return [x, 'Italy (regions)'];
  }
  if (x === 'france_regions') {
    return [x, 'France (regions)'];
  }
  if (x === 'france_overseas') {
    return [x, 'France (with overseas)'];
  }
  if (x === 'turkey_regions') {
    return [x, 'Turkey (regions)'];
  }
  return [
    x,
    x
      .split('_')
      .map(e => e[0].toUpperCase() + e.slice(1))
      .join(' '),
  ];
});

export default countries;
"""

# Assemble the file: license, imports, a blank line, the export object and
# the additional exports, each separated by a newline.
typescript_code = "\n".join(
    [license_boilerplate, imports, "", exports, additional_exports]
)

# Overwrite the generated TypeScript module.
with open("../src/countries.ts", "w") as ts_file:
    ts_file.write(typescript_code)

print("TypeScript code written to src/countries.ts")

# DOCS JSON:
# Build a display label for every map: underscores become spaces, 'usa'/'uk'
# are upper-cased and everything else is title-cased, then the " Regions" and
# " Overseas" suffixes are rewritten in parenthesised form.
formatted_countries = []
for country in countries_combined:
    label = country.replace("_", " ")
    label = label.upper() if label in {"usa", "uk"} else label.title()
    label = label.replace(" Regions"," (regions)")
    label = label.replace(" Overseas"," (with overseas)")
    formatted_countries.append(label)

# Serialise as {"countries": [...]} with 2-space indentation and a trailing newline.
docs_payload = {"countries": formatted_countries}
json_data = json.dumps(docs_payload, indent=2) + "\n"

# Overwrite the docs data file.
with open("../../../../docs/data/countries.json", "w") as docs_file:
    docs_file.write(json_data)

print("JSON written to docs/data/countries.json")