#!/usr/bin/env python
import threading
import urllib
import os
import re
import time
import getopt
import sys
import xml.dom.minidom
# Root of the remote Gameday tree. It is also the prefix stripped from a
# remote URL to derive the matching path underneath OUTPUT.
BASE = "http://gd2.mlb.com/components/game/mlb/"
# Local directory that the downloaded files are written under.
OUTPUT = "/home/wells/gameday/"
# Season to download; must be supplied with -y.
YEAR = None
# When True (-v), progress messages are printed.
VERBOSE = False

try:
    opts, args = getopt.getopt(sys.argv[1:], 'y:v')
except getopt.GetoptError as err:
    print(str(err))
    sys.exit(1)

for opt, arg in opts:
    # Compare with ==: `opt in ("-y")` is a substring test against the
    # string "-y", not membership in a one-element tuple.
    if opt == "-y":
        try:
            YEAR = int(arg)
        except ValueError:
            print("invalid year: %s" % arg)
            sys.exit(1)
    elif opt == "-v":
        VERBOSE = True

if YEAR is None:
    print("Please do pbp.py -y 2009")
    # A missing year is a usage error, so exit non-zero just like the
    # getopt failure above.
    sys.exit(1)
class Handler(threading.Thread):
    """Worker thread that downloads one month of Gameday data.

    The thread reads the listing at ``<url>year_YYYY/month_MM/``, follows
    every ``day_NN/`` link in it, and for each game linked from a day page
    saves the game's files under ``OUTPUT``, using the remote path relative
    to ``BASE`` as the local path.
    """

    # Number of attempts fetch() makes before giving up on a URL.
    MAX_TRIES = 10

    # NOTE(review): in the source as received, the patterns for game links
    # and for the inning/batters/pitchers listings were empty strings, which
    # makes match.group(1) raise IndexError on the first match. The patterns
    # below are reconstructed in the style of the 'href="day_(\d+)/"' pattern
    # used for day links -- confirm them against a live directory listing.
    GAME_LINK = r'href="(gid_\w+)/"'
    INNING_LINK = r'href="(inning_\d+\.xml)"'
    PLAYER_LINK = r'href="(\d+\.xml)"'

    def __init__(self, url, year, month):
        """Store the base URL and the year/month this thread is responsible for."""
        threading.Thread.__init__(self)
        self.url = url
        self.year = year
        self.month = month

    def fetch(self, url):
        """Return the body found at *url*, or None if it cannot be retrieved.

        An IOError while opening the URL is retried after a one second
        pause, up to MAX_TRIES attempts in total. A 404 response returns
        None immediately; any other response returns its body.
        """
        for _ in range(self.MAX_TRIES):
            try:
                page = urllib.urlopen(url)
            except IOError:
                time.sleep(1)
                continue
            try:
                if page.getcode() == 404:
                    return None
                return page.read()
            finally:
                # Always release the connection, whichever branch returned.
                page.close()
        return None

    def save(self, url, location):
        """Download *url* and write it to the file *location*.

        Nothing is written when the fetch fails or returns an empty body.
        Missing parent directories of *location* are created.
        """
        content = self.fetch(url)
        if not content:
            return
        directory = os.path.dirname(location)
        if not os.path.exists(directory):
            os.makedirs(directory)
        # Binary mode: the payload is written exactly as it was received.
        with open(location, 'wb') as out:
            out.write(content)

    def regex_save(self, url, regex):
        """Save every file that *regex* finds in the listing at *url*.

        *regex* must have one capturing group; the captured text is appended
        to *url* to form the address of each file to download.
        """
        content = self.fetch(url)
        if not content:
            return
        for match in re.finditer(regex, content, re.S):
            file_url = "%s%s" % (url, match.group(1))
            location = "%s%s" % (OUTPUT, file_url.replace(BASE, ''))
            self.save(file_url, location)

    def run(self):
        """Walk every day of the month and download each game found."""
        month_url = "%syear_%4d/month_%02d/" % (self.url, self.year, self.month)
        listing = self.fetch(month_url)
        if not listing:
            return
        for day_match in re.finditer(r'href="day_(\d+)/"', listing, re.S):
            day = int(day_match.group(1))
            day_url = "%syear_%4d/month_%02d/day_%02d/" % (
                self.url, self.year, self.month, day)
            html = self.fetch(day_url)
            if not html:
                # The day page could not be fetched; move on to the next day.
                continue
            for game_match in re.finditer(self.GAME_LINK, html, re.S):
                if self._process_game(day_url, game_match.group(1)):
                    # one second sleep between games
                    time.sleep(1)

    def _fetch_ok(self, url):
        """Return the body at *url* only for an HTTP 200 response, else None."""
        try:
            page = urllib.urlopen(url)
        except IOError:
            return None
        try:
            if page.getcode() != 200:
                return None
            return page.read()
        finally:
            page.close()

    @staticmethod
    def _is_regular_season(game_xml):
        """Return True when *game_xml* has a <game> element with type="R".

        Text that is not well-formed XML, a document without a <game>
        element, and a <game> without a type attribute all return False.
        """
        from xml.parsers.expat import ExpatError
        try:
            document = xml.dom.minidom.parseString(game_xml)
        except ExpatError:
            return False
        games = document.getElementsByTagName("game")
        if not games:
            return False
        # getAttribute returns "" for a missing attribute, which is not "R".
        return games[0].getAttribute("type") == "R"

    def _process_game(self, day_url, gid):
        """Download the files of game *gid*; return True if it was saved.

        A game is skipped (returning False) when its local directory already
        exists, when boxscore.xml or game.xml is not served with status 200,
        or when game.xml does not describe a type "R" game.
        """
        game_url = "%s%s/" % (day_url, gid)
        location = "%s%s" % (OUTPUT, game_url.replace(BASE, ''))
        if os.path.exists(location):
            if VERBOSE:
                print("skipping %s (already have)" % gid)
            return False
        # sometimes gameday has records for future games - dunno why
        if self._fetch_ok("%sboxscore.xml" % game_url) is None:
            return False
        # make sure we're working with regular season stuff
        game_xml = self._fetch_ok("%sgame.xml" % game_url)
        if game_xml is None:
            return False
        if not self._is_regular_season(game_xml):
            if VERBOSE:
                print("skipping %s (not regular season)" % gid)
            return False
        if VERBOSE:
            print("saving game %s" % gid)
        for name in ("boxscore.xml", "players.txt", "players.xml",
                     "inning/inning_hit.xml"):
            self.save("%s%s" % (game_url, name), "%s%s" % (location, name))
        # game_url already ends with "/", so no extra slash before "inning/".
        self.regex_save("%sinning/" % game_url, self.INNING_LINK)
        self.regex_save("%sbatters/" % game_url, self.PLAYER_LINK)
        self.regex_save("%spitchers/" % game_url, self.PLAYER_LINK)
        return True
# Script entry: start one Handler thread per month linked from the year
# index, then wait for all of them.
threads = []
if VERBOSE:
    print("processing %d..." % YEAR)

year_url = "%syear_%4d/" % (BASE, YEAR)
con = None
try:
    con = urllib.urlopen(year_url)
    # Only a 200 response is treated as a usable year index.
    listing = con.read() if con.getcode() == 200 else None
except IOError:
    listing = None
finally:
    if con is not None:
        con.close()

if listing is None:
    print("couldn't fetch %s" % year_url)
    # Failing to read the year index is an error, so exit non-zero.
    sys.exit(1)

for month in re.finditer(r'href="month_(\d+)/"', listing, re.S):
    handler = Handler(BASE, YEAR, int(month.group(1)))
    handler.start()
    threads.append(handler)

# Block until every month's download thread has finished.
for handler in threads:
    handler.join()