South Downs Way 50 – Past results analysis with python


My first race since the Spine is coming up, and whilst I’m not exactly at peak fitness, I thought it would be interesting to have a look at past results, to get an idea of how long the typical person takes.

If nothing else, it’s an excuse to play around with the tools I normally use at work, but with something less dry than financial time series.

I grabbed the raw data from the mighty D-U-V Statistik; the full code is at the end of the post.

Quick Check

Here’s a quick look at the data, to make sure it looks sensible:

M Winners
     year  age       full_name country        time mins_per_mile mins_per_k
1    2013   35   Perkins, Mark    GBR   6:55:37 h           8:19       5:12
125  2014   30   Navesey, Paul    GBR   6:11:28 h           7:26       4:39
426  2015   22   Mound, Victor    GBR   5:53:19 h           7:04       4:25
F Winners
     year  age        full_name country        time mins_per_mile mins_per_k
8    2013   38    Canvin, Emily    GBR   8:23:30 h          10:04       6:18
133  2014   37   Sutton, Edwina    GBR   7:09:21 h           8:35       5:22
432  2015   33   Morwood, Sarah    GBR   7:19:03 h           8:47       5:29

So far, not all that interesting (but impressive), although seeing the top 3 in terms of average pace is quite illuminating. Victor ran almost the equivalent of two back to back 3 hour marathons, over a hilly trail route.

Let’s look at some histograms.

Pace Distribution


Faster than walking pace, but nothing too crazy.

What does it take to get a top 10 spot I wonder?

Top 10, pace must be faster than 9:20 m/m
Top 10, pace must be faster than 5:50 m/k

Pretty brisk then, considering the South Downs Way has some steep bits!

Age Distribution

To finish off, let’s have a quick look at the age distribution, mostly because I’m feeling old (I turn 40 this year), and I’m hoping that at least I’ll be able to use that as an excuse for being slow.


Oh, that wasn’t what I expected… a bit late to do any more training, oh well.

For The Nerds

# Stick all the imports in one place, for clarity
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd

from collections import defaultdict, OrderedDict
from os.path import join, splitext
from os import listdir
from datetime import datetime, timedelta
from functools import partial

def duration2hours(t):
    """Convert a duration string (eg '6:55:37 h') to a number of hours.

    The trailing unit suffix (eg ' h') is ignored.
    """
    h, m, s = (int(part) for part in t.split(" ")[0].split(":"))
    td = timedelta(hours=h, minutes=m, seconds=s)
    # Use total_seconds() rather than .seconds: the latter silently drops
    # whole days, so any duration of 24h or more would come out wrong.
    return td.total_seconds() / (60.0 * 60.0)

def speed(hours=None, distance=None):
    """Return the average speed needed to cover *distance* in *hours*.

    The unit of the result follows the unit of *distance*
    (miles -> mph, km -> kph).
    """
    average = distance / hours
    return average

def fmt_mins_per_unit(mins_per_unit):
    """Returns a recognisable pace string, eg 6:55.

    Rounds to the nearest whole second *before* splitting into minutes and
    seconds, so eg 6.9999 minutes formats as '7:00'.  The previous float
    formatting could round the seconds field up to 60, producing '6:60'.
    """
    total_seconds = int(round(mins_per_unit * 60))
    mins, secs = divmod(total_seconds, 60)
    return "{0}:{1:0>2d}".format(mins, secs)
def speed2pace(speed):
    """Invert a speed (units per hour) into a pace (minutes per unit)."""
    return 60 / speed

def load_all(root="race_data"):
    """Load every race CSV under *root* into one combined DataFrame.

    Filenames are expected to look like 'swd50_2013.csv'; the year is
    parsed out of the filename.  Derived speed/pace columns are added
    (hours, mph, kph, pace_mpm, pace_mpk and formatted pace strings).

    Returns a DataFrame with one row per finisher across all years.
    """
    # Use our own headers, that are easier to refer to in code
    headers = {"Rank": "position",
               "Performance": "time",
               "Club": "club",
               "Nat": "country",
               "M/F": "gender",
               "Rank M/F": "gender_position",
               "Cat": "category",
               "Cat Rank": "category_position",
               "Avg.Speed km/h": "speed_kmh",
               "Age graded performance": "AGP"}
    content = []
    idx = 0
    for fname in listdir(root):
        year = splitext(fname)[0].split("_")[1]  # Filenames look like 'swd50_2013.csv'
        df = pd.read_csv(join(root, fname))
        df["year"] = [year] * len(df)
        df.rename(columns=headers, inplace=True)
        # NOTE(review): this re-derives position from the zero-based row
        # index, clobbering the renamed 'Rank' column - so position 0 is
        # the winner; confirm that downstream filters expect this.
        df["position"] = df.index
        # Make sure the final index is unique across all years
        df.index = df.index + idx
        idx += len(df.index)
        # BUG FIX: the per-year frames were never collected, leaving
        # 'content' empty for the combine step below.
        content.append(df)
    # pd.concat replaces the Python-2-only reduce(pd.DataFrame.append, ...)
    df = pd.concat(content)
    # Add some additional useful columns.
    # Age in the race year (assumes YOB holds a 4-digit year of birth,
    # which matches the ages shown in the winners tables).
    df["age"] = df.year.astype(int) - df.YOB
    # List comprehensions instead of Python 2's list-returning map()
    df["hours"] = [duration2hours(t) for t in df.time]
    df["mph"] = [speed(hours=h, distance=50) for h in df.hours]
    df["kph"] = [speed(hours=h, distance=50 * 1.6) for h in df.hours]
    df["pace_mpm"] = [speed2pace(s) for s in df.mph]
    df["pace_mpk"] = [speed2pace(s) for s in df.kph]
    df["mins_per_mile"] = [fmt_mins_per_unit(p) for p in df.pace_mpm]
    df["mins_per_k"] = [fmt_mins_per_unit(p) for p in df.pace_mpk]
    return df
data = load_all()

genders = ["M", "F"]
colours = {"Overall": "orange", "M": "blue", "F": "pink"}
# BUG FIX: 'years' was used below but never defined (NameError);
# derive it from the loaded data.
years = sorted(data.year.unique())

# Save a handy lookup of the data too: slices by year and/or gender,
# keyed eg 'Overall', '2014', '2014 F'
views = OrderedDict((("Overall", data),
                     ("Overall M", data[data.gender == "M"]),
                     ("Overall F", data[data.gender == "F"])))
for y in years:
    views["{0}".format(y)] = data[data.year == y]
    for g in genders:
        views["{0} {1}".format(y, g)] = data[(data.year == y) & (data.gender == g)]

# Print Summary: each year's winner, per gender.
# print() calls (not Python-2 print statements) so this runs on Python 3 too.
for g in genders:
    print("{0} Winners".format(g))
    winners = data[(data.gender_position == 1) & (data.gender == g)]
    print(winners[["year", "age", "full_name", "country", "time", "mins_per_mile", "mins_per_k"]])

# Plot the pace distribution: all finishers first, then one overlay per gender
ax = plt.subplot(111, title="Pace histogram of all finishers")
overall_mean = fmt_mins_per_unit(data.pace_mpm.mean())
ax.hist(data.pace_mpm, color="orange", label="Overall (mean {0} m/m)".format(overall_mean))
for gender in genders:
    paces = views["Overall {0}".format(gender)].pace_mpm
    gender_label = "{0} (mean {1} m/m)".format(gender, fmt_mins_per_unit(paces.mean()))
    ax.hist(paces, alpha=1, color=colours[gender], label=gender_label)
ax.legend(bbox_to_anchor=(1.5, 0.5), loc='lower center', ncol=1)

# Print the mean pace of the finisher at position 10 across the years.
# NOTE(review): 'position' is re-derived from the zero-based row index in
# load_all, so position == 10 is strictly the 11th finisher - confirm
# whether that cut-off is intended.
# print() calls so this also runs on Python 3.
tenth = data[data.position == 10]
print("Top 10, pace must be faster than {0} m/m".format(fmt_mins_per_unit(tenth.pace_mpm.mean())))
print("Top 10, pace must be faster than {0} m/k".format(fmt_mins_per_unit(tenth.pace_mpk.mean())))

# Plot the age distribution: all finishers in orange, then per-gender overlays
ax = plt.subplot(111, title="Age histogram of all finishers")
ax.hist(data.age, color="orange", label="Overall (mean {0:.0f})".format(data.age.mean()))
for gender in genders:
    ages = views["Overall {0}".format(gender)].age
    ax.hist(ages, alpha=1, color=colours[gender], label="{0} (mean {1:.0f})".format(gender, ages.mean()))
ax.legend(bbox_to_anchor=(1.5, 0.5), loc='lower center', ncol=1)