1 """
2 Introduction
3 ============
4 A simple library to download, slice and search NFL game footage on a
5 play-by-play basis.
6
7 This library comes with preloaded play-by-play meta data, which describes the
8 start time of each play in the game footage. However, the actual footage does
9 not come with this library and is not released by me. This package therefore
10 provides utilities to batch download NFL Game Footage from the original source.
11
12 Once game footage is downloaded, you can use this library to search plays and
13 construct a playlist to play in any video player.
14 """
15
16 import gzip
17 import math
18 import os
19 import os.path as path
20 import socket
21 import sys
22 import urllib2
23
24 import bs4
25
26 import eventlet
27 httplib2 = eventlet.import_patched('httplib2')
28 import eventlet.green.subprocess as subprocess
29
30 from nflgame import OrderedDict
31
# Template for the on-disk, gzipped play-by-play XML shipped next to this
# module; filled with (eid, gamekey).
_xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz')

# Template for the remote play-by-play XML; filled with (year, gamekey).
_xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml'

# Coach footage location as an rtmp triple: (server, app name, playpath).
# The playpath template is filled with (season, gamekey).
_coach_url = (
    'rtmp://neulionms.fcod.llnwd.net',
    'a5306/e1',
    'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600',
)

# Template for the broadcast HTTP Live Stream playlist (m3u8).
_broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \
                 '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8'

# In-memory caches of parsed plays, keyed by game eid. One cache per footage
# type so that broadcast and coach timings are never mixed up.
__broadcast_cache = {}
__coach_cache = {}
46
47
49 print >> sys.stderr, s
50
51
53 """
54 Returns the HTTP Live Stream URL (an m3u8 file) for the given game
55 and quality.
56
57 Note that this does not work with every game (yet). In particular,
58 URLs vary unpredictably (to me) from game to game.
59 """
60 month, day = gobj.eid[4:6], gobj.eid[6:8]
61 return _broadcast_url \
62 % (gobj.season(), month, day, gobj.gamekey, gobj.gamekey,
63 gobj.away.lower(), gobj.home.lower(), gobj.season(), quality)
64
65
67 """
68 Returns the rtmp URL as a triple for the coach footage
69 of the given game. The elements of the triple are::
70
71 (rtmp server, rtmp app name, rtmp playpath)
72
73 Coach video only comes in 1600 quality.
74 """
75 return (
76 _coach_url[0],
77 _coach_url[1],
78 _coach_url[2] % (gobj.season(), gobj.gamekey),
79 )
80
81
93
94
107
108
121
122
124 return path.join(footage_dir, '%s-%s.mp4' % (g.eid, g.gamekey))
125
126
128 return path.join(footage_play_dir, '%s-%s' % (g.eid, g.gamekey))
129
130
132 return '(Season: %s, Week: %s, %s)' \
133 % (gobj.schedule['year'], gobj.schedule['week'], gobj)
134
135
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Returns the plays of the given game that do not have a sliced video
    file yet. A play counts as sliced only when the following file is
    readable, where {playid} is the play's zero-padded id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    An empty list means every play considered has already been sliced.
    None is returned when the play-by-play meta data could not be
    retrieved.

    If coach is False, play timings for broadcast footage are used
    instead of coach timings.

    If dry_run is True, only the first 10 plays of the game are
    considered.
    """
    timed = plays(gobj, coach)
    game_dir = _play_path(footage_play_dir, gobj)
    if timed is None:
        return None

    candidates = list(timed.values())
    if dry_run:
        # A dry run never looks past the first 10 plays.
        candidates = candidates[:10]

    def missing(p):
        clip = path.join(game_dir, '%s.mp4' % p.idstr())
        return not os.access(clip, os.R_OK)

    return [p for p in candidates if missing(p)]
171
172
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Cuts a full game video into one video file per play using ffmpeg.

    full_footage_file is the path to the complete game footage and gobj
    is the nflgame.game.Game object that the footage belongs to.

    Each play is written to::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Plays that already have a readable video file are left alone, so
    running slice again does not repeat finished work.

    The ffmpeg processes are run from an eventlet green pool holding at
    most `threads` workers. slice returns only after every worker has
    finished.

    If coach is False, play timings for broadcast footage are used
    instead of coach timings.

    If dry_run is True, only the first 10 plays of the game are sliced.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    readable = os.access(game_dir, os.R_OK)
    if not readable:
        os.makedirs(game_dir)

    pending = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not pending:
        # Covers both "nothing left to do" (empty list) and "no meta data"
        # (None); the message mentions both possibilities.
        notice = (
            'There are no unsliced plays remaining for game %s %s.\n'
            'If they have not been sliced yet, then the XML play-by-play '
            'meta data may not be available or is corrupt.'
            % (gobj, _nice_game(gobj)))
        _eprint(notice)
        return

    workers = eventlet.greenpool.GreenPool(threads)
    for pending_play in pending:
        # No duration cap (0), and the scoreboard lead-in is cut (True).
        workers.spawn_n(slice_play, footage_play_dir, full_footage_file,
                        gobj, pending_play, 0, True)
    workers.waitall()

    _eprint('DONE slicing game %s' % _nice_game(gobj))
219
220
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play, in seconds. A value
    of 0 (the default) means no cap. Capping drastically cuts down on the
    time required to slice a game and the storage requirements of a game
    at the cost of potentially missing bigger plays. This is particularly
    useful if you are slicing broadcast footage, where imposing a cap at
    about 15 seconds can decrease storage and CPU requirements by more
    than half without missing much.

    When cut_scoreboard is True, the first 3.0 seconds of the play will
    be clipped to remove the scoreboard view.

    If the play has no recorded end time, it is assumed to last 40
    seconds.
    """
    outdir = _play_path(footage_play_dir, gobj)
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    st = play.start
    et = play.end
    if et is None:
        # Last recorded play: there is no following play to mark the end.
        et = st.add_seconds(40)
    if max_duration > 0 and (et.seconds() - st.seconds()) > max_duration:
        et = st.add_seconds(max_duration)

    if cut_scoreboard:
        st = st.add_seconds(3.0)

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # ffmpeg reads the digits after the '.' as a decimal fraction of a
    # second, so the milliseconds must be zero padded to three digits.
    # Without the padding, 50ms would be written as ".50" (half a second).
    stamp = '%02d:%02d:%02d.%03d'
    start_time = stamp % (st.hh, st.mm, st.ss, st.milli)
    duration = stamp % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-i', full_footage_file,
           '-t', duration,
           '-map', '0',
           '-strict', '-2',
           outpath,
           ]
    _run_command(cmd)
267
268
270 """
271 Starts an ffmpeg process to download the full broadcast of the given
272 game with the quality provided. The qualities available are:
273 400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.
274
275 The footage will be saved to the following path::
276
277 footage_dir/{eid}-{gamekey}.mp4
278
279 If footage is already at that path, then a LookupError is raised.
280
281 A full game's worth of footage at a quality of 1600 is about 2GB.
282 """
283 fp = _full_path(footage_dir, gobj)
284 if os.access(fp, os.R_OK):
285 raise LookupError('Footage path "%s" already exists.' % fp)
286
287 url = broadcast_url(gobj, quality)
288
289
290
291
292 resp, _ = httplib2.Http().request(url, 'HEAD')
293 if resp['status'] != '200':
294 _eprint('BAD URL (http status %s) for game %s: %s'
295 % (resp['status'], _nice_game(gobj), url))
296 _eprint('FAILED to download game %s' % _nice_game(gobj))
297 return
298
299 cmd = ['ffmpeg',
300 '-timeout', '60',
301 '-i', url]
302 if dry_run:
303 cmd += ['-t', '30']
304 cmd += ['-strict', '-2', fp]
305
306 _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
307 if not _run_command(cmd):
308 _eprint('FAILED to download game %s' % _nice_game(gobj))
309 else:
310 _eprint('DONE with game %s' % _nice_game(gobj))
311
312
314 """
315 Starts an rtmpdump process to download the full coach footage of the
316 given game. Currently, the only quality available is 1600.
317
318 The footage will be saved to the following path::
319
320 footage_dir/{eid}-{gamekey}.mp4
321
322 If footage is already at that path, then a LookupError is raised.
323
324 A full game's worth of footage at a quality of 1600 is about 1GB.
325 """
326 fp = _full_path(footage_dir, gobj)
327 if os.access(fp, os.R_OK):
328 raise LookupError('Footage path "%s" already exists.' % fp)
329
330 server, app, path = coach_url(gobj)
331
332 cmd = ['rtmpdump',
333 '--rtmp', server,
334 '--app', app,
335 '--playpath', path,
336 '--timeout', '60',
337 ]
338 if dry_run:
339 cmd += ['--stop', '30']
340 cmd += ['-o', fp]
341
342 _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
343 status = _run_command(cmd)
344 if status is None:
345 _eprint('DONE (incomplete) with game %s' % _nice_game(gobj))
346 elif not status:
347 _eprint('FAILED to download game %s' % _nice_game(gobj))
348 else:
349 _eprint('DONE with game %s' % _nice_game(gobj))
350
351
353 try:
354 p = subprocess.Popen(cmd,
355 stdout=subprocess.PIPE,
356 stderr=subprocess.STDOUT)
357 output = p.communicate()[0].strip()
358
359 if p.returncode > 0:
360 err = subprocess.CalledProcessError(p.returncode, cmd)
361 err.output = output
362 raise err
363 except subprocess.CalledProcessError, e:
364
365 if e.returncode == 2 and cmd[0] == 'rtmpdump':
366 return None
367 indent = lambda s: '\n'.join(map(lambda l: ' %s' % l, s.split('\n')))
368 _eprint("Could not run '%s' (exit code %d):\n%s"
369 % (' '.join(cmd), e.returncode, indent(e.output)))
370 return False
371 except OSError, e:
372 _eprint("Could not run '%s' (errno: %d): %s"
373 % (' '.join(cmd), e.errno, e.strerror))
374 return False
375 return True
376
377
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, or the data contains no
    play timings, None is returned.

    Results are kept in an in-memory cache (one per footage type) that is
    consulted only once the game is over. If the game is over, the XML
    data is also saved to disk.
    """
    if coach:
        cache = __coach_cache
    else:
        cache = __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None

    # Store into the cache that matches the requested footage type. Always
    # writing to the broadcast cache would hand coach timings to broadcast
    # callers and leave the coach cache permanently empty.
    cache[gobj.eid] = ps

    if gobj.game_over():
        fp = _xmlf % (gobj.eid, gobj.gamekey)
        try:
            out = gzip.open(fp, 'w+')
            try:
                out.write(rawxml)
            finally:
                # Closing is what flushes the gzip trailer to disk.
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
417
418
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or the play-by-play data
    for the game could not be retrieved, None is returned.
    """
    # Forward coach so that broadcast timings are actually honoured.
    ps = plays(gobj, coach)
    if ps is None:
        return None
    return ps.get(playid, None)
430
431
432 -class Play (object):
433 """
434 Represents a single play with meta data that ties it to game footage.
435 The footage_start corresponds to the 'ArchiveTCIN' or 'CATIN', which
436 is when the play starts. Since there is no record of when a play
437 stops, the end is computed by using the start time of the next play.
438 If it's the last play recorded, then the end time is None.
439
440 The play id is the foreign key that maps to play data stored in nflgame.
441 """
def __init__(self, start, end, playid):
    """Stores the start time, end time and play id of this play."""
    self.start = start
    self.end = end
    self.playid = playid
444
446 """Returns a string play id padded with zeroes."""
447 return '%04d' % int(self.playid)
448
450 return '(%s, %s, %s)' % (self.playid, self.start, self.end)
451
452
454 """
455 Represents a footage time point, in the format HH:MM:SS:MMM where
456 MMM can be either 2 or 3 digits.
457 """
def __init__(self, point=None, seconds=None):
    """
    Construct a PlayTime object given a point in time in the format
    HH:MM:SS:MMM where MMM can be either 2 or 3 digits.

    Alternatively, seconds can be provided (which may be a float). When
    seconds is given, point is ignored.

    An AssertionError is raised if point is not made up of exactly four
    integer fields separated by colons.
    """
    if seconds is not None:
        whole = int(math.floor(seconds))
        milli = int(1000 * (seconds - math.floor(seconds)))

        # divmod floors, matching integer division on every Python version.
        hh, rest = divmod(whole, 3600)
        mm, ss = divmod(rest, 60)

        self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
        self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
        return

    self.__point = point
    self.__coach = False

    fields = self.__point.split(':')
    # Check the field count before touching fields[3], so that a short
    # time point is reported as a format problem rather than an IndexError.
    if len(fields) != 4:
        raise AssertionError('Expected 4 parts but got %d in: %s'
                             % (len(fields), self.__point))
    try:
        parts = [int(field) for field in fields]
    except ValueError:
        raise AssertionError('Bad play time format: %s' % self.__point)

    # A three digit final field marks this as a coach time point.
    if len(fields[3]) == 3:
        self.__coach = True

    self.hh, self.mm, self.ss, self.milli = parts

    # Otherwise the final field is scaled by 10 (presumably because two
    # digit fields are hundredths of a second) so milli is milliseconds.
    if not self.__coach:
        self.milli *= 10
503
509
511 """
512 Returns this time point rounded to the nearest second.
513 """
514 secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
515 if self.milli >= 50:
516 secs += 1
517 return secs
518
520 """
521 Returns this time point as fractional seconds based on milliseconds.
522 """
523 secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
524 secs = (1000 * secs) + self.milli
525 return float(secs) / 1000.0
526
529
531 """
532 Returns the difference rounded to nearest second between
533 two time points. The 'other' time point must take place before the
534 current time point.
535 """
536 assert other <= self, '%s is not <= than %s' % (other, self)
537 return int(round(self.fractional() - other.fractional()))
538
541
542
544 """
545 Parses the XML raw data given into an ordered dictionary of Play
546 objects corresponding to coach play timings. If coach is set to
547 False, then play timings for the broadcast are retrieved.
548
549 The dictionary is keyed by play id.
550 """
551 if data is None:
552 return None
553
554
555
556 rows = []
557 for row in bs4.BeautifulSoup(data).find_all('row'):
558 playid = row.find('id')
559 if not playid:
560 playid = row.get('playid', None)
561 if not playid:
562 continue
563 playid = playid.strip()
564 else:
565 playid = playid.get_text().strip()
566
567 if coach:
568 start = row.find('catin')
569 else:
570 start = row.find('archivetcin')
571 if not start:
572 continue
573 start = PlayTime(start.get_text().strip())
574
575
576 if len(rows) > 0 and start < rows[-1][1]:
577 continue
578 rows.append((playid, start, row))
579
580
581
582
def ignore(row):
    """
    Returns True when the row describes a timeout or a two-minute
    stoppage, judged by the start of its 'playdescription' or
    'preplaybyplay' attribute (case insensitive). Otherwise returns False.
    """
    attributes = row.attrs
    if 'playdescription' in attributes:
        description = row['playdescription'].lower()
        if description.startswith(('timeout', 'two-minute')):
            return True
    if 'preplaybyplay' in attributes:
        lead_in = row['preplaybyplay'].lower()
        if lead_in.startswith('timeout'):
            return True
    return False
595
596 d = OrderedDict()
597 for i, (playid, start, row) in enumerate(rows):
598 if ignore(row):
599 continue
600 end = None
601 if i < len(rows) - 1:
602 end = rows[i+1][1]
603 d[playid] = Play(start, end, playid)
604 return d
605
606
608 """
609 Returns the XML play data corresponding to the game given. A game must
610 be specified in one of two ways: by providing the eid and gamekey or
611 by providing the file path to a gzipped XML file.
612
613 If the XML data is already on disk, it is read, decompressed and returned.
614
615 Otherwise, the XML data is downloaded from the NFL web site. If the data
616 doesn't exist yet or there was an error, _get_xml_data returns None.
617 """
618 assert (eid is not None and gamekey is not None) or fpath is not None
619
620 if fpath is not None:
621 return gzip.open(fpath).read()
622
623 fpath = _xmlf % (eid, gamekey)
624 if os.access(fpath, os.R_OK):
625 return gzip.open(fpath).read()
626 try:
627 year = int(eid[0:4])
628 month = int(eid[4:6])
629 if month <= 3:
630 year -= 1
631 u = _xml_base_url % (year, gamekey)
632 return urllib2.urlopen(u, timeout=10).read()
633 except urllib2.HTTPError, e:
634 _eprint(e)
635 except socket.timeout, e:
636 _eprint(e)
637 return None
638