Package nflvid
[frames | no frames]

Source Code for Package nflvid

  1  """ 
  2  Introduction 
  3  ============ 
  4  A simple library to download, slice and search NFL game footage on a 
  5  play-by-play basis. 
  6   
  7  This library comes with preloaded play-by-play meta data, which describes the 
  8  start time of each play in the game footage. However, the actual footage does 
  9  not come with this library and is not released by me. This package therefore 
 10  provides utilities to batch download NFL Game Footage from the original source. 
 11   
 12  Once game footage is downloaded, you can use this library to search plays and 
 13  construct a playlist to play in any video player. 
 14  """ 
 15   
 16  import gzip 
 17  import math 
 18  import os 
 19  import os.path as path 
 20  import socket 
 21  import sys 
 22  import urllib2 
 23   
 24  import bs4 
 25   
 26  import eventlet 
 27  httplib2 = eventlet.import_patched('httplib2') 
 28  import eventlet.green.subprocess as subprocess 
 29   
 30  from nflgame import OrderedDict 
 31   
 32  _xmlf = path.join(path.split(__file__)[0], 'pbp-xml', '%s-%s.xml.gz') 
 33  _xml_base_url = 'http://e2.cdnl3.neulion.com/nfl/edl/nflgr/%d/%s.xml' 
 34  _coach_url = 'rtmp://neulionms.fcod.llnwd.net/a5306/e1/mp4:' \ 
 35               'u/nfl/nfl/coachtapes/%s/%s_all_1600' 
 36  _coach_url = ( 
 37      'rtmp://neulionms.fcod.llnwd.net', 
 38      'a5306/e1', 
 39      'mp4:u/nfl/nfl/coachtapes/%s/%s_all_1600', 
 40  ) 
 41  _broadcast_url = 'http://nlds82.cdnl3nl.neulion.com/nlds_vod/nfl/vod/' \ 
 42                   '%s/%s/%s/%s/2_%s_%s_%s_%s_h_whole_1_%s.mp4.m3u8' 
 43   
 44  __broadcast_cache = {}  # game eid -> play id -> Play 
 45  __coach_cache = {}  # game eid -> play id -> Play 
 46   
 47   
def _eprint(s):
    """Write ``s`` (converted to text) and a trailing newline to stderr."""
    sys.stderr.write('%s\n' % (s,))
50 51
def broadcast_url(gobj, quality='1600'):
    """
    Builds the HTTP Live Stream URL (an m3u8 file) for the broadcast
    footage of the given game at the given quality.

    The month and day are taken from characters 4-5 and 6-7 of the
    game's eid.

    Note that this does not work with every game (yet). In particular,
    URLs vary unpredictably from game to game.
    """
    eid = gobj.eid
    month = eid[4:6]
    day = eid[6:8]
    fields = (
        gobj.season(), month, day, gobj.gamekey, gobj.gamekey,
        gobj.away.lower(), gobj.home.lower(), gobj.season(), quality,
    )
    return _broadcast_url % fields
64 65
def coach_url(gobj):
    """
    Returns the rtmp location of the coach footage for the given game
    as a triple. The elements of the triple are::

        (rtmp server, rtmp app name, rtmp playpath)

    Coach video only comes in 1600 quality.
    """
    server, app, playpath_template = _coach_url
    playpath = playpath_template % (gobj.season(), gobj.gamekey)
    return (server, app, playpath)
80 81
def footage_full(footage_dir, gobj):
    """
    Locates the full video for a game inside an nflvid footage directory.

    Returns the file path when it is readable and None otherwise.
    """
    candidate = _full_path(footage_dir, gobj)
    return candidate if os.access(candidate, os.R_OK) else None
93 94
def footage_plays(footage_play_dir, gobj):
    """
    Returns a list of all footage broken down by play inside an nflvid
    footage directory. The list is sorted numerically by play id.

    The play id of an entry is its file name with the last four
    characters (the extension, e.g. ``.mp4``) removed. Entries whose
    name does not yield an integer play id (stray or temporary files)
    are skipped instead of aborting the listing with a ValueError.

    If no footage breakdown exists for the game provided, then an empty list
    is returned.
    """
    fp = _play_path(footage_play_dir, gobj)
    if not os.access(fp, os.R_OK):
        return []

    numbered = []
    for name in os.listdir(fp):
        try:
            numbered.append((int(name[0:-4]), name))
        except ValueError:
            # Not a play slice (e.g. '.DS_Store'); ignore it.
            continue
    # Sort on the numeric id only so that ties keep their listing order.
    numbered.sort(key=lambda pair: pair[0])
    return [name for _, name in numbered]
107 108
def footage_play(footage_play_dir, gobj, playid):
    """
    Locates the slice of a single play inside the footage play directory
    of the given game.

    The file name is the play id zero-padded to four digits plus ``.mp4``.
    Returns the file path when it is readable and None otherwise.
    """
    filename = '%04d.mp4' % int(playid)
    candidate = path.join(_play_path(footage_play_dir, gobj), filename)
    if os.access(candidate, os.R_OK):
        return candidate
    return None
121 122
def _full_path(footage_dir, g):
    """Path of the full-game video: ``{footage_dir}/{eid}-{gamekey}.mp4``."""
    filename = '%s-%s.mp4' % (g.eid, g.gamekey)
    return path.join(footage_dir, filename)
125 126
def _play_path(footage_play_dir, g):
    """Directory of a game's play slices: ``{footage_play_dir}/{eid}-{gamekey}``."""
    dirname = '%s-%s' % (g.eid, g.gamekey)
    return path.join(footage_play_dir, dirname)
129 130
def _nice_game(gobj):
    """Human readable label for a game, used in progress and error messages."""
    schedule = gobj.schedule
    return '(Season: %s, Week: %s, %s)' % (
        schedule['year'], schedule['week'], gobj)
134 135
def unsliced_plays(footage_play_dir, gobj, coach=True, dry_run=False):
    """
    Scans the game directory inside footage_play_dir and returns the
    list of plays that have not been sliced yet. A play counts as sliced
    only if the following file is readable, where {playid} is its
    zero-padded play id::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    An empty list means every play considered has been sliced. None is
    returned if the play-by-play meta data could not be retrieved.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is True, then only the first 10 plays of the game are
    considered.
    """
    all_plays = plays(gobj, coach)
    if all_plays is None:
        return None

    game_dir = _play_path(footage_play_dir, gobj)
    candidates = list(all_plays.values())
    if dry_run:
        candidates = candidates[:10]

    def is_missing(p):
        slice_file = path.join(game_dir, '%s.mp4' % p.idstr())
        return not os.access(slice_file, os.R_OK)

    return [p for p in candidates if is_missing(p)]
171 172
def slice(footage_play_dir, full_footage_file, gobj, coach=True,
          threads=4, dry_run=False):
    """
    Uses ffmpeg to cut the given full-game footage file into one video
    per play. full_footage_file should point to a full game downloaded
    with nflvid-footage and gobj should be the corresponding
    nflgame.game.Game object.

    The pieces are saved under footage_play_dir as::

        {footage_play_dir}/{eid}-{gamekey}/{playid}.mp4

    Work is not duplicated: plays that already have a readable video
    file are left alone.

    Slicing runs in an eventlet green pool so that several ffmpeg
    instances run at once; threads is the size of that pool. The
    function returns only after every spawned slice has finished.

    If coach is False, then play timings for broadcast footage will be
    used instead of coach timings.

    If dry_run is true, then only the first 10 plays of the game are
    sliced.
    """
    game_dir = _play_path(footage_play_dir, gobj)
    if not os.access(game_dir, os.R_OK):
        os.makedirs(game_dir)

    pending = unsliced_plays(footage_play_dir, gobj, coach, dry_run)
    if not pending:  # None (no meta data) or an empty list
        _eprint(
            'There are no unsliced plays remaining for game %s %s.\n'
            'If they have not been sliced yet, then the XML play-by-play '
            'meta data may not be available or is corrupt.'
            % (gobj, _nice_game(gobj)))
        return

    workers = eventlet.greenpool.GreenPool(threads)
    for pending_play in pending:
        workers.spawn_n(slice_play, footage_play_dir, full_footage_file,
                        gobj, pending_play, 0, True)
    workers.waitall()

    _eprint('DONE slicing game %s' % _nice_game(gobj))
219 220
def slice_play(footage_play_dir, full_footage_file, gobj, play,
               max_duration=0, cut_scoreboard=True):
    """
    This is just like slice, but it only slices the play provided.
    In typical cases, slice should be used since it makes sure not
    to duplicate work.

    This function will not check if the play-by-play directory for
    gobj has been created.

    max_duration is used to cap the length of a play. This drastically
    cuts down on the time required to slice a game and the storage
    requirements of a game at the cost of potentially missing bigger
    plays. This is particularly useful if you are slicing broadcast
    footage, where imposing a cap at about 15 seconds can decrease
    storage and CPU requirements by more than half without missing much.

    When cut_scoreboard is True, the first 3.0 seconds of the play will
    be clipped to remove the scoreboard view. The clip is skipped when
    it would leave nothing of the play (i.e. the play is 3 seconds or
    shorter), since that would produce a negative duration.
    """
    outdir = _play_path(footage_play_dir, gobj)
    outpath = path.join(outdir, '%s.mp4' % play.idstr())

    st = play.start
    et = play.end
    if et is None:  # Probably the last play of the game.
        et = st.add_seconds(40)
    if max_duration > 0 and (et.seconds() - st.seconds()) > max_duration:
        et = st.add_seconds(max_duration)

    if cut_scoreboard:
        clipped = st.add_seconds(3.0)
        if clipped.fractional() < et.fractional():
            st = clipped

    dr = PlayTime(seconds=et.fractional() - st.fractional())

    # The part after the dot is a decimal fraction of a second, so the
    # milliseconds must be zero padded: 5 ms is '.005', not '.5'.
    start_time = '%02d:%02d:%02d.%03d' % (st.hh, st.mm, st.ss, st.milli)
    duration = '%02d:%02d:%02d.%03d' % (dr.hh, dr.mm, dr.ss, dr.milli)
    cmd = ['ffmpeg',
           '-ss', start_time,
           '-i', full_footage_file,
           '-t', duration,
           '-map', '0',
           '-strict', '-2',
           outpath,
           ]
    _run_command(cmd)
267 268
def download_broadcast(footage_dir, gobj, quality='1600', dry_run=False):
    """
    Starts an ffmpeg process to download the full broadcast of the given
    game with the quality provided. The qualities available are:
    400, 800, 1200, 1600, 2400, 3000, 4500 with 4500 being the best.

    The footage will be saved to the following path::

        footage_dir/{eid}-{gamekey}.mp4

    If footage is already at that path, then a LookupError is raised.

    When dry_run is true, ffmpeg is limited with ``-t 30``.

    A full game's worth of footage at a quality of 1600 is about 2GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    url = broadcast_url(gobj, quality)

    # Probe the URL with a HEAD request first. A bad URL is a common
    # failure, and a short message is nicer than raw ffmpeg output.
    response, _ = httplib2.Http().request(url, 'HEAD')
    status = response['status']
    if status != '200':
        _eprint('BAD URL (http status %s) for game %s: %s'
                % (status, _nice_game(gobj), url))
        _eprint('FAILED to download game %s' % _nice_game(gobj))
        return

    command = ['ffmpeg', '-timeout', '60', '-i', url]
    if dry_run:
        command.extend(['-t', '30'])
    command.extend(['-strict', '-2', target])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    if _run_command(command):
        _eprint('DONE with game %s' % _nice_game(gobj))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
311 312
def download_coach(footage_dir, gobj, dry_run=False):
    """
    Starts an rtmpdump process to download the full coach footage of the
    given game. Currently, the only quality available is 1600.

    The footage will be saved to the following path::

        footage_dir/{eid}-{gamekey}.mp4

    If footage is already at that path, then a LookupError is raised.

    When dry_run is true, rtmpdump is limited with ``--stop 30``.

    A full game's worth of footage at a quality of 1600 is about 1GB.
    """
    target = _full_path(footage_dir, gobj)
    if os.access(target, os.R_OK):
        raise LookupError('Footage path "%s" already exists.' % target)

    server, app, playpath = coach_url(gobj)

    command = [
        'rtmpdump',
        '--rtmp', server,
        '--app', app,
        '--playpath', playpath,
        '--timeout', '60',
    ]
    if dry_run:
        command.extend(['--stop', '30'])
    command.extend(['-o', target])

    _eprint('Downloading game %s %s' % (gobj.eid, _nice_game(gobj)))
    outcome = _run_command(command)
    # _run_command yields None for rtmpdump's exit code 2, True on
    # success and False on any other failure.
    if outcome is None:
        _eprint('DONE (incomplete) with game %s' % _nice_game(gobj))
    elif outcome:
        _eprint('DONE with game %s' % _nice_game(gobj))
    else:
        _eprint('FAILED to download game %s' % _nice_game(gobj))
350 351
def _run_command(cmd):
    """
    Runs cmd (a list of program and arguments) and waits for it to end,
    capturing stdout and stderr together.

    Returns True when the process exits with status 0, None when the
    program is rtmpdump and it exits with status 2, and False otherwise
    (after reporting the failure on stderr). Any non-zero status counts
    as a failure, including the negative codes reported for a child
    that was killed by a signal.
    """
    try:
        p = subprocess.Popen(cmd,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT)
        output = p.communicate()[0].strip()
    except OSError as e:
        _eprint("Could not run '%s' (errno: %d): %s"
                % (' '.join(cmd), e.errno, e.strerror))
        return False

    if p.returncode == 0:
        return True

    # A hack for rtmpdump...
    if p.returncode == 2 and cmd[0] == 'rtmpdump':
        return None

    if not isinstance(output, str):
        # Captured output is bytes on Python 3; make it printable text.
        output = output.decode('utf-8', 'replace')
    indented = '\n'.join(' %s' % line for line in output.split('\n'))
    _eprint("Could not run '%s' (exit code %d):\n%s"
            % (' '.join(cmd), p.returncode, indented))
    return False
376 377
def plays(gobj, coach=True):
    """
    Returns an ordered dictionary of all plays for a particular game
    with timings for the coach footage. If coach is False, then the
    timings will be for the broadcast footage.

    The game must be a nflgame.game.Game object.

    If there is a problem retrieving the data, None is returned.

    Results are cached in memory per footage type (coach or broadcast);
    the cache is only consulted once the game is over. If the game is
    over, the XML data is also saved to disk.
    """
    cache = __coach_cache if coach else __broadcast_cache

    if gobj.game_over() and gobj.eid in cache:
        return cache[gobj.eid]

    rawxml = _get_xml_data(gobj.eid, gobj.gamekey)
    ps = _xml_plays(rawxml, coach)
    if ps is None:
        return None
    if len(ps) == 0:
        _eprint('Could not find timing nodes in XML data, '
                'which provide the start time of each play.')
        return None
    # Store in the cache that matches the requested footage type so coach
    # and broadcast timings never get mixed up.
    cache[gobj.eid] = ps

    # Save the XML data to disk if the game is over.
    if gobj.game_over():
        fp = _xmlf % (gobj.eid, gobj.gamekey)
        try:
            out = gzip.open(fp, 'wb')
            try:
                out.write(rawxml)
            finally:
                out.close()
        except IOError:
            _eprint('Could not cache XML data. Please make '
                    '"%s" writable.' % path.dirname(fp))
    return ps
417 418
def play(gobj, playid, coach=True):
    """
    Returns a Play object given a game and a play id with timings for
    the coach footage. If coach is False, then the timings will be for
    the broadcast footage.

    The game must be a nflgame.game.Game object.

    If a play with the given id does not exist, or the play-by-play
    data for the game could not be retrieved, None is returned.
    """
    ps = plays(gobj, coach)
    if ps is None:
        return None
    return ps.get(playid, None)
430 431
class Play (object):
    """
    A single play together with the meta data that ties it to game
    footage.

    The start corresponds to the 'ArchiveTCIN' or 'CATIN' node, which is
    when the play starts. There is no record of when a play stops, so
    the end is the start time of the next play, or None for the last
    play recorded.

    The play id is the foreign key that maps to play data stored in nflgame.
    """

    def __init__(self, start, end, playid):
        self.start = start
        self.end = end
        self.playid = playid

    def idstr(self):
        """Returns the play id as a string zero-padded to four digits."""
        numeric_id = int(self.playid)
        return '%04d' % numeric_id

    def __str__(self):
        fields = (self.playid, self.start, self.end)
        return '(%s, %s, %s)' % fields
451 452
class PlayTime (object):
    """
    Represents a footage time point, in the format HH:MM:SS:MMM where
    MMM can be either 2 or 3 digits.

    The parsed components are exposed as the integer attributes hh, mm,
    ss and milli, where milli is always expressed in milliseconds.
    """

    def __init__(self, point=None, seconds=None):
        """
        Construct a PlayTime object given a point in time in the format
        HH:MM:SS:MMM where MMM can be either 2 or 3 digits.

        Alternatively, seconds can be provided (which may be a float).

        An AssertionError is raised when point does not consist of four
        colon separated integers.
        """
        if seconds is not None:
            # Round to whole milliseconds first. Truncating the fractional
            # part loses a millisecond whenever the float is a hair below
            # the intended value (e.g. 65.05 is stored as 65.0499999...).
            total_milli = int(round(seconds * 1000))
            whole, milli = divmod(total_milli, 1000)
            hh, rest = divmod(whole, 3600)
            mm, ss = divmod(rest, 60)

            self.hh, self.mm, self.ss, self.milli = hh, mm, ss, milli
            self.__coach = True  # milli is already in real milliseconds
            self.__point = '%02d:%02d:%02d:%03d' % (hh, mm, ss, milli)
            return

        self.__point = point
        self.__coach = False

        parts = self.__point.split(':')
        # Validate the shape before indexing into it. Explicit raises are
        # used so the checks survive running Python with -O.
        if len(parts) != 4:
            raise AssertionError('Expected 4 parts but got %d in: %s'
                                 % (len(parts), self.__point))
        if len(parts[3]) == 3:
            self.__coach = True
        try:
            parts = [int(part) for part in parts]
        except ValueError:
            raise AssertionError('Bad play time format: %s' % self.__point)

        self.hh, self.mm, self.ss, self.milli = parts

        # I believe milliseconds is given in tens of milliseconds
        # for the ArchiveTCIN node. But the CATIN node (coach timing)
        # provides regular milliseconds.
        if not self.__coach:
            self.milli *= 10

    def add_seconds(self, seconds):
        """
        Returns a new PlayTime with seconds (int or float) added to self.
        """
        return PlayTime(seconds=self.fractional() + seconds)

    def seconds(self):
        """
        Returns this time point rounded to the nearest second.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        # milli is in milliseconds, so half a second is 500.
        if self.milli >= 500:
            secs += 1
        return secs

    def fractional(self):
        """
        Returns this time point as fractional seconds based on milliseconds.
        """
        secs = (self.hh * 60 * 60) + (self.mm * 60) + self.ss
        secs = (1000 * secs) + self.milli
        return float(secs) / 1000.0

    # Time points are ordered by their fractional seconds. __cmp__ is kept
    # for Python 2 callers; the rich comparisons make ordering work on
    # interpreters that no longer consult __cmp__.
    def __cmp__(self, other):
        mine, theirs = self.fractional(), other.fractional()
        return (mine > theirs) - (mine < theirs)

    def __eq__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() == other.fractional()

    def __ne__(self, other):
        if not isinstance(other, PlayTime):
            return NotImplemented
        return self.fractional() != other.fractional()

    def __lt__(self, other):
        return self.fractional() < other.fractional()

    def __le__(self, other):
        return self.fractional() <= other.fractional()

    def __gt__(self, other):
        return self.fractional() > other.fractional()

    def __ge__(self, other):
        return self.fractional() >= other.fractional()

    def __hash__(self):
        # Keep instances hashable, consistent with __eq__.
        return hash(self.fractional())

    def __sub__(self, other):
        """
        Returns the difference rounded to nearest second between
        two time points. The 'other' time point must take place before the
        current time point.
        """
        assert other <= self, '%s is not <= than %s' % (other, self)
        return int(round(self.fractional() - other.fractional()))

    def __str__(self):
        return self.__point
541 542
def _xml_plays(data, coach=True):
    """
    Parses raw play-by-play XML into an ordered dictionary of Play
    objects keyed by play id.

    Start times are read from the 'catin' node of each row when coach is
    true and from the 'archivetcin' node otherwise. Rows without a play
    id or a start time are skipped, as are rows whose start time is
    earlier than the previously accepted row.

    Returns None when data is None.
    """
    if data is None:
        return None

    timing_tag = 'catin' if coach else 'archivetcin'

    def is_ignored(node):
        # Timeouts and two-minute warnings take a lot of time but are
        # not needed for play-by-play footage.
        if 'playdescription' in node.attrs:
            if node['playdescription'].lower().startswith('timeout'):
                return True
            if node['playdescription'].lower().startswith('two-minute'):
                return True
        if 'preplaybyplay' in node.attrs:
            if node['preplaybyplay'].lower().startswith('timeout'):
                return True
        return False

    # Collect everything first: the end of a play is the start of the
    # row that follows it, so we need to be able to look ahead.
    timed = []
    for node in bs4.BeautifulSoup(data).find_all('row'):
        id_node = node.find('id')
        if id_node:
            pid = id_node.get_text().strip()
        else:
            pid = node.get('playid', None)
            if not pid:
                continue
            pid = pid.strip()

        start_node = node.find(timing_tag)
        if not start_node:
            continue
        began = PlayTime(start_node.get_text().strip())

        # Skip rows that would make the start times go backwards.
        if timed and began < timed[-1][1]:
            continue
        timed.append((pid, began, node))

    result = OrderedDict()
    last_index = len(timed) - 1
    for index, (pid, began, node) in enumerate(timed):
        if is_ignored(node):
            continue
        if index < last_index:
            ended = timed[index + 1][1]
        else:
            ended = None
        result[pid] = Play(began, ended, pid)
    return result
def _read_gzipped(fpath):
    """Returns the decompressed contents of the gzip file at fpath."""
    handle = gzip.open(fpath)
    try:
        return handle.read()
    finally:
        handle.close()


def _get_xml_data(eid=None, gamekey=None, fpath=None):
    """
    Returns the XML play data corresponding to the game given. A game must
    be specified in one of two ways: by providing the eid and gamekey or
    by providing the file path to a gzipped XML file.

    If the XML data is already on disk, it is read, decompressed and returned.

    Otherwise, the XML data is downloaded from the NFL web site. If the data
    doesn't exist yet or there was an error (HTTP error, unreachable host
    or timeout), the error is reported on stderr and None is returned.
    """
    assert (eid is not None and gamekey is not None) or fpath is not None

    if fpath is not None:
        return _read_gzipped(fpath)

    fpath = _xmlf % (eid, gamekey)
    if os.access(fpath, os.R_OK):
        return _read_gzipped(fpath)

    # The eid starts with YYYYMM. Games played in January through March
    # are filed under the previous year.
    year = int(eid[0:4])
    month = int(eid[4:6])
    if month <= 3:
        year -= 1
    u = _xml_base_url % (year, gamekey)  # The year and the game key.
    try:
        response = urllib2.urlopen(u, timeout=10)
        try:
            return response.read()
        finally:
            response.close()
    except urllib2.URLError as e:  # HTTPError is a subclass of URLError.
        _eprint(e)
    except socket.timeout as e:
        _eprint(e)
    return None
638