#!/usr/bin/env python
"""Download MLB Gameday data for one season.

Usage: pbp.py -y YEAR [-v]

For every month listed under ``BASE/year_YYYY/`` a worker thread walks the
day and game directories and mirrors the files of each regular-season game
(game.xml ``type`` attribute equal to "R") below ``OUTPUT``.

This script targets Python 2: it relies on ``urllib.urlopen``.
"""

import errno
import getopt
import os
import re
import sys
import threading
import time
import urllib
import xml.dom.minidom
import xml.parsers.expat

BASE = "http://gd2.mlb.com/components/game/mlb/"
OUTPUT = "/home/wells/gameday/"
YEAR = None
VERBOSE = False

# Number of times fetch() tries to open a URL before giving up.
FETCH_ATTEMPTS = 10

# NOTE(review): the four link patterns that used these names were empty
# strings in the original source, although the code reads match.group(1)
# from them (which raises IndexError). They were presumably stripped
# together with HTML markup. The patterns below are reconstructed by analogy
# with the surviving 'href="day_(\d+)/"' and 'href="month_(\d+)/"' patterns;
# confirm them against a real directory listing.
GAME_LINK_PATTERN = r'href="(gid_[^"/]+)/"'
INNING_FILE_PATTERN = r'href="(inning_\d+\.xml)"'
PLAYER_FILE_PATTERN = r'href="(\d+\.xml)"'


def _local_path(url):
    """Map a URL below BASE to the matching path below OUTPUT."""
    return "%s%s" % (OUTPUT, url.replace(BASE, ''))


class Handler(threading.Thread):
    """Worker thread that mirrors every game of one month."""

    def __init__(self, url, year, month):
        """Remember the base URL and the year/month this worker handles."""
        threading.Thread.__init__(self)
        self.url = url
        self.year = year
        self.month = month

    def fetch(self, url):
        """Return the body of ``url``, or None if it cannot be retrieved.

        Opening the URL is retried up to FETCH_ATTEMPTS times, sleeping one
        second after each IOError. A 404 answer yields None; the body of any
        other answer is returned as-is.
        """
        for _ in range(FETCH_ATTEMPTS):
            try:
                page = urllib.urlopen(url)
            except IOError:
                time.sleep(1)
                continue
            try:
                if page.getcode() == 404:
                    return None
                return page.read()
            finally:
                page.close()
        # Every attempt failed to connect.
        return None

    def _fetch_ok(self, url):
        """Return the body of ``url`` if it answers HTTP 200, else None.

        Makes a single attempt; a connection error counts as "not there".
        """
        try:
            page = urllib.urlopen(url)
        except IOError:
            return None
        try:
            if page.getcode() != 200:
                return None
            return page.read()
        finally:
            page.close()

    def save(self, url, location):
        """Download ``url`` and write it to ``location``.

        Nothing is written when the download yields no content. Missing
        parent directories of ``location`` are created.
        """
        content = self.fetch(url)
        if not content:
            return

        directory = os.path.dirname(location)
        if directory and not os.path.isdir(directory):
            try:
                os.makedirs(directory)
            except OSError as err:
                # Another worker may have created it in the meantime.
                if err.errno != errno.EEXIST:
                    raise

        # The payload is written exactly as received.
        with open(location, 'wb') as out:
            out.write(content)

    def regex_save(self, url, regex):
        """Save every file linked from the listing at ``url``.

        ``regex`` is applied to the listing body; group 1 of each match is
        taken as a file name relative to ``url``.
        """
        content = self.fetch(url)
        if not content:
            return

        for found in re.finditer(regex, content, re.S):
            file_url = "%s%s" % (url, found.group(1))
            self.save(file_url, _local_path(file_url))

    def _is_regular_season(self, game_url):
        """Return True if game.xml marks the game with type "R"."""
        body = self._fetch_ok("%sgame.xml" % game_url)
        if body is None:
            return False
        try:
            document = xml.dom.minidom.parseString(body)
        except xml.parsers.expat.ExpatError:
            return False
        game = document.getElementsByTagName("game").item(0)
        # getAttribute returns "" when the attribute is missing.
        return game is not None and game.getAttribute("type") == "R"

    def _save_game(self, game_url, location):
        """Mirror the files of a single game below ``location``."""
        for name in ("boxscore.xml", "players.txt", "players.xml",
                     "inning/inning_hit.xml"):
            self.save("%s%s" % (game_url, name),
                      "%s%s" % (location, name))

        # game_url already ends with "/", so no extra slash is added here.
        self.regex_save("%sinning/" % game_url, INNING_FILE_PATTERN)
        self.regex_save("%sbatters/" % game_url, PLAYER_FILE_PATTERN)
        self.regex_save("%spitchers/" % game_url, PLAYER_FILE_PATTERN)

    def run(self):
        """Walk every day of the month and save each new regular-season game."""
        month_url = "%syear_%4d/month_%02d/" % (self.url, self.year,
                                                self.month)
        month_listing = self.fetch(month_url)
        if not month_listing:
            return

        for day_match in re.finditer(r'href="day_(\d+)/"', month_listing,
                                     re.S):
            day = int(day_match.group(1))
            day_url = "%syear_%4d/month_%02d/day_%02d/" % (
                self.url, self.year, self.month, day)
            day_listing = self.fetch(day_url)
            if not day_listing:
                continue

            for game_match in re.finditer(GAME_LINK_PATTERN, day_listing,
                                          re.S):
                gid = game_match.group(1)
                game_url = "%s%s/" % (day_url, gid)
                location = _local_path(game_url)

                if os.path.exists(location):
                    if VERBOSE:
                        print("skipping %s (already have)" % gid)
                    continue

                # sometimes gameday has records for future games - dunno why
                if self._fetch_ok("%sboxscore.xml" % game_url) is None:
                    continue

                # make sure we're working with regular season stuff
                if self._fetch_ok("%sgame.xml" % game_url) is None:
                    continue
                if not self._is_regular_season(game_url):
                    if VERBOSE:
                        print("skipping %s (not regular season)" % gid)
                    continue

                if VERBOSE:
                    print("saving game %s" % gid)
                self._save_game(game_url, location)

                # one second sleep between games
                time.sleep(1)


def main(argv=None):
    """Parse the command line and start one Handler thread per month.

    ``argv`` defaults to ``sys.argv[1:]``. Recognised options are ``-y YEAR``
    (required) and ``-v`` (verbose). The parsed values are stored in the
    module-level YEAR and VERBOSE, which the worker threads read.
    """
    global YEAR, VERBOSE

    if argv is None:
        argv = sys.argv[1:]

    try:
        opts, _args = getopt.getopt(argv, 'y:v')
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(1)

    for opt, arg in opts:
        if opt == "-y":
            YEAR = int(arg)
        elif opt == "-v":
            VERBOSE = True

    if YEAR is None:
        print("Please do pbp.py -y 2009")
        raise SystemExit

    if VERBOSE:
        print("processing %d..." % YEAR)

    year_url = "%syear_%4d/" % (BASE, YEAR)
    con = urllib.urlopen(year_url)
    try:
        if con.getcode() != 200:
            print("couldn't fetch %syear_%4d/" % (BASE, YEAR))
            raise SystemExit
        year_listing = con.read()
    finally:
        con.close()

    threads = []
    for month in re.finditer(r'href="month_(\d+)/"', year_listing, re.S):
        handler = Handler(BASE, YEAR, int(month.group(1)))
        handler.start()
        threads.append(handler)

    # Wait for every month to finish before returning.
    for handler in threads:
        handler.join()


if __name__ == "__main__":
    main()