I found growth curves for girls and boys in Israel:

For example, see this:

I used the great online resource Web Plot Digitizer v4 to extract the data from the images files. I captured all the growth curves as best as I could. The first step now is to get interpolated versions of the digitized data. For instance, see below the 50th percentile for boys:

import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="ticks", font_scale=1.5)
from scipy.optimize import curve_fit
from scipy.special import erf
from scipy.interpolate import UnivariateSpline
import matplotlib.animation as animation
from scipy.stats import norm
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'notebook'
# %matplotlib widget
define useful arrays
age_list = np.round(np.arange(2.0, 20.1, 0.1), 1)
height_list = np.round(np.arange(70, 220, 0.1), 1)
import sample data, boys 50th percentile
df_temp_boys_50th = pd.read_csv('../archive/data/height/boys-p50.csv', names=['age','height'])
spline = UnivariateSpline(df_temp_boys_50th['age'], df_temp_boys_50th['height'], s=0.5)
interpolated = spline(age_list)
plot
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(df_temp_boys_50th['age'], df_temp_boys_50th['height'], label='digitized data',
        marker='o', markerfacecolor='None', markeredgecolor="black", markersize=6, linestyle='None')
ax.plot(age_list, interpolated, label='interpolated', color="black", linewidth=2)
ax.set(xlabel='age (years)',
       ylabel='height (cm)',
       xticks=np.arange(2, 21, 2),
       title="boys, 50th percentile"
       )
ax.legend(frameon=False);

Let’s do the same for all the other curves, and then save them to a file.

interpolate all growth curves
col_names = ['p05', 'p10', 'p25', 'p50', 'p75', 'p90', 'p95']
file_names_boys = ['boys-p05.csv', 'boys-p10.csv', 'boys-p25.csv', 'boys-p50.csv',
                   'boys-p75.csv', 'boys-p90.csv', 'boys-p95.csv',]
file_names_girls = ['girls-p05.csv', 'girls-p10.csv', 'girls-p25.csv', 'girls-p50.csv',
                   'girls-p75.csv', 'girls-p90.csv', 'girls-p95.csv',]

# create dataframe with age column
df_boys = pd.DataFrame({'age': age_list})
df_girls = pd.DataFrame({'age': age_list})
# loop over file names and read in data
for i, file_name in enumerate(file_names_boys):
    # read in data
    df_temp = pd.read_csv('../archive/data/height/' + file_name, names=['age','height'])
    spline = UnivariateSpline(df_temp['age'], df_temp['height'], s=0.5)
    df_boys[col_names[i]] = spline(age_list)
for i, file_name in enumerate(file_names_girls):
    # read in data
    df_temp = pd.read_csv('../archive/data/height/' + file_name, names=['age','height'])
    spline = UnivariateSpline(df_temp['age'], df_temp['height'], s=0.5)
    df_girls[col_names[i]] = spline(age_list)

# make age index
df_boys.set_index('age', inplace=True)
df_boys.index = df_boys.index.round(1)
df_boys.to_csv('../archive/data/height/boys_height_vs_age_combined.csv', index=True)
df_girls.set_index('age', inplace=True)
df_girls.index = df_girls.index.round(1)
df_girls.to_csv('../archive/data/height/girls_height_vs_age_combined.csv', index=True)

Let’s take a look at what we just did.

df_girls
p05 p10 p25 p50 p75 p90 p95
age
2.0 79.269087 80.794167 83.049251 85.155597 87.475854 89.779822 90.882059
2.1 80.202106 81.772053 84.052858 86.207778 88.713405 90.883740 92.409913
2.2 81.130687 82.706754 85.011591 87.211543 89.856186 91.940642 93.416959
2.3 82.048325 83.601023 85.928399 88.170313 90.914093 92.953965 94.270653
2.4 82.948516 84.457612 86.806234 89.087509 91.897022 93.927147 95.226089
... ... ... ... ... ... ... ...
19.6 152.520938 154.812286 158.775277 163.337149 167.699533 171.531349 173.969235
19.7 152.534223 154.814440 158.791925 163.310864 167.704618 171.519600 173.980150
19.8 152.548001 154.827666 158.815071 163.275852 167.708562 171.504730 173.990964
19.9 152.562338 154.853760 158.845506 163.231563 167.711342 171.486629 174.001704
20.0 152.577300 154.894521 158.884019 163.177444 167.712936 171.465189 174.012396

181 rows × 7 columns

show all interpolated curves for girls
fig, ax = plt.subplots(figsize=(8, 6))
# loop over col_names and plot each column
colors = sns.color_palette("Oranges", len(col_names))
for col, color in zip(col_names, colors):
    ax.plot(df_girls.index, df_girls[col], label=col, color=color)
ax.set(xlabel='age (years)',
       ylabel='height (cm)',
       xticks=np.arange(2, 21, 2),
       title="growth curves for girls\npercentile curves: 5, 10, 25, 50, 75, 90, 95",
       );

Let’s now see the percentiles for girls age 20.

plot cdf for girls, age 20
fig, ax = plt.subplots(figsize=(8, 6))
percentile_list = np.array([5, 10, 25, 50, 75, 90, 95])
data = df_girls.loc[20.0]
ax.plot(data, percentile_list, ls='', marker='o', markersize=6, color="black")
ax.set(xlabel='height (cm)',
         ylabel='percentile',
         yticks=percentile_list,
         title="cdf for girls, age 20"
         );

I suspect that the heights in the population are normally distributed. Let’s check that. I’ll fit the data to the integral of a gaussian, because the percentiles correspond to a cdf. If a pdf is a gaussian, its cumulative is given by

\Phi(x) = \frac{1}{2} \left( 1 + \text{erf}\left(\frac{x - \mu}{\sigma \sqrt{2}}\right) \right)

where \mu is the mean and \sigma is the standard deviation of the distribution. The error function \text{erf} is a sigmoid function, which is a good approximation for the cdf of the normal distribution.

define functions
def erf_model(x, mu, sigma):
    return 50 * (1 + erf((x - mu) / (sigma * np.sqrt(2))) )
# initial guess for parameters: [mu, sigma]
p0 = [150, 6]
# Calculate R-squared
def calculate_r2(y_true, y_pred):
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return 1 - (ss_res / ss_tot)
fit model to data
data = df_girls.loc[20.0]
params, _ = curve_fit(erf_model, data, percentile_list, p0=p0,
                        bounds=([100, 3],   # lower bounds for mu and sigma
                                [200, 10])  # upper bounds for mu and sigma
                        )
# store the parameters in the dataframe
percentile_predicted = erf_model(data, *params)
# R-squared value
r2 = calculate_r2(percentile_list, percentile_predicted)
show results
fig, ax = plt.subplots(figsize=(8, 6))
percentile_list = np.array([5, 10, 25, 50, 75, 90, 95])
data = df_girls.loc[20.0]
ax.plot(data, percentile_list, ls='', marker='o', markersize=6, color="black", label='data')
fit = erf_model(height_list, *params)
ax.plot(height_list, fit, label='fit', color="red", linewidth=2)
ax.text(150, 75, f'$\mu$ = {params[0]:.1f} cm\n$\sigma$ = {params[1]:.1f} cm\nR$^2$ = {r2:.6f}',
        fontsize=14, bbox=dict(facecolor='white', alpha=0.5))
ax.legend(frameon=False)
ax.set(xlabel='height (cm)',
       xlim=(140, 190),
         ylabel='percentile',
         yticks=percentile_list,
         title="the data is very well fitted by a normal distribution"
         );

Another way of making sure that the model fits the data is to make a QQ plot. In this plot, the quantiles of the data are plotted against the quantiles of the normal distribution. If the data is normally distributed, the points should fall on a straight line.

show QQ plot
fitted_quantiles = norm.cdf(data, loc=params[0], scale=params[1])
experimental_quantiles = percentile_list / 100
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_aspect('equal', adjustable='box')
ax.plot(experimental_quantiles, fitted_quantiles,
        ls='', marker='o', markersize=6, color="black",
        label='qq points')
ax.plot([0, 1], [0, 1], color='red', linewidth=2, label="1:1 line")
ax.set(xlabel='empirical quantiles',
       ylabel='fitted quantiles',
       xlim=(0, 1),
       ylim=(0, 1),
       title="QQ plot")
ax.legend(frameon=False)

Great, now we just need to do exactly the same for both sexes, and all the ages. I chose to divide age from 2 to 20 into 0.1 intervals.

create dataframes to store the parameters mu, sigma, r2
df_stats_boys = pd.DataFrame(index=age_list, columns=['mu', 'sigma', 'r2'])
df_stats_boys['mu'] = 0.0
df_stats_boys['sigma'] = 0.0
df_stats_boys['r2'] = 0.0
df_stats_girls = pd.DataFrame(index=age_list, columns=['mu', 'sigma', 'r2'])
df_stats_girls['mu'] = 0.0
df_stats_girls['sigma'] = 0.0
df_stats_girls['r2'] = 0.0
fit model to all the data
p0 = [80, 3]
# loop over ages in the index, calculate mu and sigma
for i in df_boys.index:
    # fit the model to the data
    data = df_boys.loc[i]
    params, _ = curve_fit(erf_model, data, percentile_list, p0=p0,
                          bounds=([70, 2],   # lower bounds for mu and sigma
                                  [200, 10])  # upper bounds for mu and sigma
                         )
    # store the parameters in the dataframe
    df_stats_boys.at[i, 'mu'] = params[0]
    df_stats_boys.at[i, 'sigma'] = params[1]
    percentile_predicted = erf_model(data, *params)
    # R-squared value
    r2 = calculate_r2(percentile_list, percentile_predicted)
    df_stats_boys.at[i, 'r2'] = r2
    p0 = params
# same for girls
p0 = [80, 3]
for i in df_girls.index:
    # fit the model to the data
    data = df_girls.loc[i]
    params, _ = curve_fit(erf_model, data, percentile_list, p0=p0,
                          bounds=([70, 3],   # lower bounds for mu and sigma
                                  [200, 10])  # upper bounds for mu and sigma
                         )
    # store the parameters in the dataframe
    df_stats_girls.at[i, 'mu'] = params[0]
    df_stats_girls.at[i, 'sigma'] = params[1]
    percentile_predicted = erf_model(data, *params)
    # R-squared value
    r2 = calculate_r2(percentile_list, percentile_predicted)
    df_stats_girls.at[i, 'r2'] = r2
    p0 = params

# save the dataframes to csv files
df_stats_boys.to_csv('../archive/data/height/boys_height_stats.csv', index=True)
df_stats_girls.to_csv('../archive/data/height/girls_height_stats.csv', index=True)

Let’s see what we got. The top panel in the graph shows the average height for boys and girls, the middle panel shows the coefficient of variation (\sigma/\mu), and the bottom panel shows the R2 of the fit (note that the range is very close to 1).

df_stats_boys
mu sigma r2
2.0 86.463069 3.563785 0.999511
2.1 87.374895 3.596583 0.999676
2.2 88.269676 3.627433 0.999742
2.3 89.148086 3.657263 0.999752
2.4 90.010783 3.686764 0.999733
... ... ... ...
19.6 176.802810 7.134561 0.999991
19.7 176.845789 7.135786 0.999994
19.8 176.892196 7.137430 0.999995
19.9 176.942521 7.139466 0.999990
20.0 176.997255 7.141858 0.999976

181 rows × 3 columns

plot results
fig, ax = plt.subplots(3,1, figsize=(8, 10), sharex=True)
fig.subplots_adjust(left=0.15)
ax[0].plot(df_stats_boys['mu'], label='boys', lw=2)
ax[0].plot(df_stats_girls['mu'], label='girls', lw=2)
ax[0].legend(frameon=False)

ax[1].plot(df_stats_boys['sigma'] / df_stats_boys['mu'], lw=2)
ax[1].plot(df_stats_girls['sigma'] / df_stats_girls['mu'], lw=2)

ax[2].plot(df_stats_boys.index, df_stats_boys['r2'], label=r'$r2$ boys', lw=2)
ax[2].plot(df_stats_girls.index, df_stats_girls['r2'], label=r'$r2$ girls', lw=2)

ax[0].set(ylabel='average height (cm)',)
ax[1].set(ylabel='CV',
          ylim=[0,0.055])
ax[2].set(xlabel='age (years)',
            ylabel=r'$R^2$',
            xticks=np.arange(2, 21, 2),
          );

Let’s see how the pdfs for boys and girls move and morph as age increases.

produce dataframes for pre-calculated pdfs
age_list_string = age_list.astype(str).tolist()
df_pdf_boys = pd.DataFrame(index=height_list, columns=age_list_string)
df_pdf_girls = pd.DataFrame(index=height_list, columns=age_list_string)

for age in df_pdf_boys.columns:
    age_float = round(float(age), 1)
    df_pdf_boys[age] = norm.pdf(height_list,
                                loc=df_stats_boys.loc[age_float]['mu'],
                                scale=df_stats_boys.loc[age_float]['sigma'])
for age in df_pdf_girls.columns:
    age_float = round(float(age), 1)
    df_pdf_girls[age] = norm.pdf(height_list,
                                loc=df_stats_girls.loc[age_float]['mu'],
                                scale=df_stats_girls.loc[age_float]['sigma'])
df_pdf_girls
2.0 2.1 2.2 2.3 2.4 2.5 2.6 2.7 2.8 2.9 ... 19.1 19.2 19.3 19.4 19.5 19.6 19.7 19.8 19.9 20.0
70.0 0.000006 2.962419e-06 1.229580e-06 4.740717e-07 1.893495e-07 7.928033e-08 3.395629e-08 1.454961e-08 6.214658e-09 2.698367e-09 ... 3.876760e-46 4.998212e-46 6.108274e-46 6.965756e-46 7.300518e-46 6.928073e-46 5.866310e-46 4.367574e-46 2.817087e-46 1.550490e-46
70.1 0.000007 3.369929e-06 1.401926e-06 5.423176e-07 2.172465e-07 9.118694e-08 3.914667e-08 1.681357e-08 7.199311e-09 3.133161e-09 ... 4.821662e-46 6.212999e-46 7.589544e-46 8.652519e-46 9.067461e-46 8.605908e-46 7.289698e-46 5.430839e-46 3.506265e-46 1.932327e-46
70.2 0.000008 3.830459e-06 1.597215e-06 6.199308e-07 2.490751e-07 1.048086e-07 4.509972e-08 1.941687e-08 8.334521e-09 3.635676e-09 ... 5.995467e-46 7.721230e-46 9.427830e-46 1.074523e-45 1.125944e-45 1.068759e-45 9.056344e-46 6.751373e-46 4.363019e-46 2.407630e-46
70.3 0.000009 4.350475e-06 1.818328e-06 7.081296e-07 2.853621e-07 1.203810e-07 5.192270e-08 2.240831e-08 9.642428e-09 4.216078e-09 ... 7.453283e-46 9.593350e-46 1.170864e-45 1.334099e-45 1.397806e-45 1.326973e-45 1.124851e-45 8.391039e-46 5.427845e-46 2.999137e-46
70.4 0.000010 4.937172e-06 2.068480e-06 8.082806e-07 3.267014e-07 1.381707e-07 5.973725e-08 2.584341e-08 1.114829e-08 4.885994e-09 ... 9.263403e-46 1.191661e-45 1.453785e-45 1.655996e-45 1.734906e-45 1.647188e-45 1.396806e-45 1.042648e-45 6.750965e-46 3.735083e-46
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
219.5 0.000000 5.214425e-307 1.377605e-289 3.568527e-277 6.457994e-266 2.232144e-255 6.340272e-246 9.969867e-238 1.389324e-230 5.441854e-224 ... 5.200570e-18 5.741874e-18 6.194642e-18 6.495054e-18 6.583545e-18 6.417949e-18 5.986319e-18 5.315101e-18 4.468701e-18 3.538724e-18
219.6 0.000000 1.813597e-307 5.050074e-290 1.356408e-277 2.537010e-266 9.046507e-256 2.642444e-246 4.256155e-238 6.055129e-231 2.417510e-224 ... 4.558798e-18 5.035058e-18 5.433557e-18 5.698034e-18 5.775970e-18 5.630212e-18 5.250299e-18 4.659675e-18 3.915265e-18 3.097919e-18
219.7 0.000000 6.302763e-308 1.849870e-290 5.151948e-278 9.959447e-267 3.663840e-256 1.100546e-246 1.815751e-238 2.637298e-231 1.073274e-224 ... 3.995288e-18 4.414220e-18 4.764871e-18 4.997654e-18 5.066279e-18 4.938013e-18 4.603699e-18 4.084117e-18 3.429566e-18 2.711382e-18
219.8 0.000000 2.188653e-308 6.771033e-291 1.955386e-278 3.906942e-267 1.482823e-256 4.580523e-247 7.741154e-239 1.147918e-231 4.761829e-225 ... 3.500614e-18 3.869030e-18 4.177503e-18 4.382343e-18 4.442754e-18 4.329907e-18 4.035791e-18 3.578814e-18 3.003413e-18 2.372514e-18
219.9 0.000000 7.594139e-309 2.476504e-291 7.416066e-279 1.531537e-267 5.997065e-257 1.905138e-247 3.298116e-239 4.993198e-232 2.111339e-225 ... 3.066470e-18 3.390384e-18 3.661688e-18 3.841895e-18 3.895062e-18 3.795805e-18 3.537115e-18 3.135297e-18 2.629596e-18 2.075507e-18

1500 rows × 181 columns

plotly widget
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default = 'notebook'

# create figure
fig = go.Figure()

# assume both dataframes have the same columns (ages) and index (height)
ages = df_pdf_boys.columns
x_vals = df_pdf_boys.index

# add traces: 2 per age (boys and girls), all hidden except the first pair
for i, age in enumerate(ages):
    fig.add_trace(go.Scatter(x=x_vals, y=df_pdf_boys[age], name=f'Boys {age}', 
                             line=dict(color='#1f77b4'), visible=(i == 0)))
    fig.add_trace(go.Scatter(x=x_vals, y=df_pdf_girls[age], name=f'Girls {age}', 
                             line=dict(color='#ff7f0e'), visible=(i == 0)))

# create slider steps
steps = []
for i, age in enumerate(ages):
    vis = [False] * (2 * len(ages))
    vis[2*i] = True      # boys trace
    vis[2*i + 1] = True  # girls trace

    steps.append(dict(
        method='update',
        args=[{'visible': vis},
              {'title': f'Height Distribution - Age: {age}'}],
        label=str(age)
    ))

# define slider
sliders = [dict(
    active=0,
    currentvalue={"prefix": "Age: "},
    pad={"t": 50},
    steps=steps
)]

# update layout
fig.update_layout(
    sliders=sliders,
    title='Height Distribution by Age',
    xaxis_title='Height (cm)',
    yaxis_title='Density',
    yaxis=dict(range=[0, 0.12]),
    showlegend=True,
    height=600,
    width=800
)

fig.show()

A few notes about what we can learn from the analysis above.

I’ll plot one last graph from now, let’s see what we can learn from it. Let’s see the pdf for boys and girls across three age groups: 8, 12, and 15 year olds.

comparison across three ages
fig, ax = plt.subplots(3, 1, figsize=(8, 12), sharex=True)
fig.subplots_adjust(hspace=0.1)
ages_to_plot = [8.0, 12.0, 15.0]

for i, age in enumerate(ages_to_plot):
    pdf_boys = norm.pdf(height_list, loc=df_stats_boys.loc[age]['mu'], scale=df_stats_boys.loc[age]['sigma'])
    pdf_girls = norm.pdf(height_list, loc=df_stats_girls.loc[age]['mu'], scale=df_stats_girls.loc[age]['sigma'])
    ax[i].plot(height_list, pdf_boys, label='boys', color='tab:blue')
    ax[i].plot(height_list, pdf_girls, label='girls', color='tab:orange')
    ax[i].text(0.98, 0.98, f'age: {age} years', transform=ax[i].transAxes, verticalalignment='top', horizontalalignment='right')
    ax[i].set(ylabel='pdf',
              ylim=(0, 0.07),
            )
ax[2].legend(frameon=False)
ax[2].set(xlabel='height (cm)',
          xlim=(100, 200),);