My first race since the Spine is coming up, and whilst I’m not exactly at peak fitness, I thought it would be interesting to have a look at past results, to get an idea of how long the typical person takes.
If nothing else, it’s an excuse to play around with the tools I normally use at work, but with something less dry than financial time series.
I grabbed the raw data from the mighty D-U-V Statistik; the full code is at the end of the post.
Quick Check
Here’s a quick look at the data, to make sure it looks sensible:
M Winners
year age full_name country time mins_per_mile mins_per_k
1 2013 35 Perkins, Mark GBR 6:55:37 h 8:19 5:12
125 2014 30 Navesey, Paul GBR 6:11:28 h 7:26 4:39
426 2015 22 Mound, Victor GBR 5:53:19 h 7:04 4:25
F Winners
year age full_name country time mins_per_mile mins_per_k
8 2013 38 Canvin, Emily GBR 8:23:30 h 10:04 6:18
133 2014 37 Sutton, Edwina GBR 7:09:21 h 8:35 5:22
432 2015 33 Morwood, Sarah GBR 7:19:03 h 8:47 5:29
So far, not all that interesting (but impressive), although seeing the top 3 in terms of average pace is quite illuminating. Victor ran almost the equivalent of two back to back 3 hour marathons, over a hilly trail route.
Let’s look at some histograms.
Pace Distribution

Faster than walking pace, but nothing too crazy.
What does it take to get a top 10 spot I wonder?
Top 10, pace must be faster than 9:20 m/m
Top 10, pace must be faster than 5:50 m/k
Pretty brisk then, considering the South Downs Way has some steep bits!
Age Distribution
To finish off, let’s have a quick look at the age distribution, mostly because I’m feeling old (I turn 40 this year), and I’m hoping that at least I’ll be able to use that as an excuse for being slow.
Oh, that wasn’t what I expected… bit late to do any more training, oh well.
For The Nerds
# Stick all the imports in one place, for clarity
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
from collections import defaultdict, OrderedDict
from os.path import join, splitext
from os import listdir
from datetime import datetime, timedelta
from functools import partial
def duration2hours(t):
    """Convert a duration string (eg '6:55:37 h') to a number of hours.

    Only the first whitespace-separated token is parsed, as
    ``hours:minutes:seconds``; anything after it (such as the trailing
    ``h`` unit) is ignored.  The result is a float, so '6:30:00 h'
    gives 6.5.

    Raises an exception (``IndexError`` or ``ValueError``) if the token
    does not contain three integer fields.
    """
    ts = t.split()[0].split(":")
    td = timedelta(hours=int(ts[0]), minutes=int(ts[1]), seconds=int(ts[2]))
    # total_seconds() rather than .seconds: the latter excludes whole days,
    # so a 25 hour finish would otherwise come back as 1 hour.
    return td.total_seconds() / float(60 * 60)
def speed(hours=None, distance=None):
    """Return the average speed for covering `distance` in `hours`.

    The result is in whatever distance unit was passed, per hour (pass
    miles to get mph, kilometres to get kph).  Both arguments must be
    supplied; the ``None`` defaults only exist so that either one can be
    bound by keyword (eg with ``functools.partial``).

    Raises ``ZeroDivisionError`` if `hours` is zero.
    """
    # Multiply by 1.0 to force true division: under Python 2 two integer
    # arguments would otherwise be floor-divided (50 / 4 == 12).
    return (1.0 * distance) / hours
def fmt_mins_per_unit(mins_per_unit):
    """Returns a recognisable pace string, eg 6:55

    `mins_per_unit` is a (possibly fractional) number of minutes; the
    result is ``minutes:seconds`` with the seconds rounded to the nearest
    whole second and zero-padded to two digits.
    """
    # Round the total number of seconds *before* splitting it up, so that
    # a value such as 6.999 becomes "7:00" instead of "6:60".
    total_seconds = int(round(mins_per_unit * 60))
    minutes, seconds = divmod(total_seconds, 60)
    return "{0}:{1:02d}".format(minutes, seconds)
def speed2pace(speed):
    """Convert a speed in units per hour into a pace in minutes per unit.

    For example 8 (mph) gives 7.5 (minutes per mile).

    Raises ``ZeroDivisionError`` if `speed` is zero.
    """
    # 60.0 rather than 60 so that an integer speed is not floor-divided
    # under Python 2 (60 / 8 == 7).
    return 60.0 / speed
def load_all():
    """Load every results CSV under ``race_data`` into a single DataFrame.

    Each file is expected to be named ``<prefix>_<year>.csv``; the year is
    taken from the filename.  Files without a ``.csv`` extension are
    skipped, and files are read in sorted filename order so that the
    resulting index is reproducible.

    The raw column names are renamed to the shorter ones in `headers`, and
    the following columns are added:

    * ``year``          - the year parsed from the filename (a string)
    * ``position``      - 1-based row position within that year's file
    * ``age``           - the current calendar year minus ``YOB`` (so the
                          age when this code is run, not on race day)
    * ``hours``         - the ``time`` column converted to hours
    * ``mph`` / ``kph`` - average speed over the race distance
    * ``pace_mpm`` / ``pace_mpk`` - pace in minutes per mile / per km
    * ``mins_per_mile`` / ``mins_per_k`` - the paces formatted as m:ss

    The returned frame has a unique integer index spanning all files.

    Raises ``ValueError`` if no CSV files are found.
    """
    root = "race_data"
    race_miles = 50
    km_per_mile = 1.6  # the approximation used throughout (not 1.609344)
    # Use our own headers, that are easier to refer to in code
    headers = {"Rank": "position",
               "Performance": "time",
               "Club": "club",
               "Nat": "country",
               "M/F": "gender",
               "Rank M/F": "gender_position",
               "Cat": "category",
               "Cat Rank": "category_position",
               "Avg.Speed km/h": "speed_kmh",
               "Age graded performance": "AGP"}
    content = []
    idx = 0
    for fname in sorted(listdir(root)):
        stem, ext = splitext(fname)
        if ext.lower() != ".csv":
            continue  # stray non-data file in the directory
        year = stem.split("_")[1]  # Filenames look like 'swd50_2013.csv'
        df = pd.read_csv(join(root, fname))
        df.rename(columns=headers, inplace=True)
        df["year"] = year
        # The freshly read frame has a 0-based index, so add one to get a
        # position where the first row is 1 (and "== 10" means tenth).
        df["position"] = df.index + 1
        # Make sure the final index is unique
        df.index = df.index + idx
        idx += len(df.index)
        content.append(df)
    if not content:
        raise ValueError("No CSV files found in {0!r}".format(root))
    df = pd.concat(content)
    # Add some additional useful columns
    this_year = datetime.today().year
    df["age"] = this_year - df["YOB"]
    df["hours"] = df["time"].apply(duration2hours)
    df["mph"] = df["hours"].apply(partial(speed, distance=race_miles))
    df["kph"] = df["hours"].apply(partial(speed, distance=race_miles * km_per_mile))
    df["pace_mpm"] = df["mph"].apply(speed2pace)
    df["pace_mpk"] = df["kph"].apply(speed2pace)
    df["mins_per_mile"] = df["pace_mpm"].apply(fmt_mins_per_unit)
    df["mins_per_k"] = df["pace_mpk"].apply(fmt_mins_per_unit)
    return df
data = load_all()
genders = ["M", "F"]
colours = {"Overall": "orange", "M": "blue", "F": "pink"}
# The years present in the data, in ascending order
years = sorted(data.year.unique())

# Save a handy lookup of the data too
views = OrderedDict((("Overall", data),
                     ("Overall M", data[data.gender == "M"]),
                     ("Overall F", data[data.gender == "F"])))
for y in years:
    views["{0}".format(y)] = data[data.year == y]
    for g in genders:
        views["{0} {1}".format(y, g)] = data[(data.year == y) & (data.gender == g)]

# Print Summary: one table per gender, listing each year's winner
for g in genders:
    print("{0} Winners".format(g))
    winners = data[(data.gender_position == 1) & (data.gender == g)]
    print(winners[["year", "age", "full_name", "country", "time", "mins_per_mile", "mins_per_k"]])
    print("")

# Plot pace distribution: everyone first, then each gender drawn on top
ax = plt.subplot(111, title="Pace histogram of all finishers")
ax.hist(data.pace_mpm, color=colours["Overall"],
        label="Overall (mean {0} m/m)".format(fmt_mins_per_unit(data.pace_mpm.mean())))
for g in genders:
    subset = views["Overall {0}".format(g)]
    ax.hist(subset.pace_mpm, alpha=1, color=colours[g],
            label="{0} (mean {1} m/m)".format(g, fmt_mins_per_unit(subset.pace_mpm.mean())))
# Anchor the legend outside the axes, to the right of the plot
ax.legend(bbox_to_anchor=(1.5, 0.5), loc='lower center', ncol=1)
plt.show()

# Print top 10 pace: mean pace of the rows whose position is 10, over all years
tenth = data[data.position == 10]
print("Top 10, pace must be faster than {0} m/m".format(fmt_mins_per_unit(tenth.pace_mpm.mean())))
print("Top 10, pace must be faster than {0} m/k".format(fmt_mins_per_unit(tenth.pace_mpk.mean())))

# Plot age distribution, laid out the same way as the pace histogram
ax = plt.subplot(111, title="Age histogram of all finishers")
ax.hist(data.age, color=colours["Overall"],
        label="Overall (mean {0:.0f})".format(data.age.mean()))
for g in genders:
    subset = views["Overall {0}".format(g)]
    ax.hist(subset.age, alpha=1, color=colours[g],
            label="{0} (mean {1:.0f})".format(g, subset.age.mean()))
ax.legend(bbox_to_anchor=(1.5, 0.5), loc='lower center', ncol=1)
plt.show()