Data Science Portfolio

Scrape FanDuel NBA Player Stats with Scrapy

FanDuel player stats archive:

http://rotoguru1.com/cgi-bin/hyday.pl?mon=4&day=10&year=2019&game=fd

Overview

This project builds a small Scrapy spider that collects daily FanDuel NBA player stats (fantasy points, salary, team, opponent, minutes, and the full box-score line) from the RotoGuru archive linked above. The spider parses each day's stats table, follows the navigation link to the adjacent date, and writes the results to a JSON file, which we then load into a pandas DataFrame.

Import libraries

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

Begin creating our scraper

# Create a new spider class
class RotoSpider(scrapy.Spider):
    # Name our spider
    name = "RS"
    
    # URL(s) to start with.
    start_urls = [
        'http://rotoguru1.com/cgi-bin/hyday.pl?mon=4&day=10&year=2019&game=fd'
    ]
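
The query string encodes the date (mon, day, year) and the site (game=fd for FanDuel). If you would rather scrape a fixed range of dates than rely on the page's navigation links, you can generate the start URLs up front. A minimal sketch, assuming a ten-day window ending on April 10, 2019 (the window size is an arbitrary choice):

from datetime import date, timedelta

# Template for one day's FanDuel stats page
base = 'http://rotoguru1.com/cgi-bin/hyday.pl?mon={0.month}&day={0.day}&year={0.year}&game=fd'
end = date(2019, 4, 10)

# One URL per day in the window, newest first
start_urls = [base.format(end - timedelta(days=i)) for i in range(10)]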

Define the parse method

# Create a new spider class
class RotoSpider(scrapy.Spider):

    # ...

    # Use XPath to parse the response we get
    def parse(self, response):
        # Get the table rows
        trs = response.xpath('/html/body/table[1]//table[@cellspacing=5]//tr')
            
        # Remove the first two rows, since they don't contain any player data
        trs = trs[2:]
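
XPath expressions like this one are easiest to develop interactively. Scrapy's shell fetches a page and drops you into an interpreter with response already populated, so you can test selectors before committing them to the spider:

$ scrapy shell 'http://rotoguru1.com/cgi-bin/hyday.pl?mon=4&day=10&year=2019&game=fd'
>>> trs = response.xpath('/html/body/table[1]//table[@cellspacing=5]//tr')
>>> len(trs)                                    # includes the two header rows we drop
>>> trs[2].xpath('./td[2]/a/text()').extract()  # first player's name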

Iterate over the table rows and extract player stats

# Create a new spider class
class RotoSpider(scrapy.Spider):

    # ...

    # Parse the response with XPath
    def parse(self, response):

        # ...
        
        # Check if table rows exist
        if trs:        
            # Iterate over each row
            for tr in trs:
                # Yield a dictionary with our desired values
                yield {
                    # Extract each player's stats here
                    'Name': tr.xpath('./td[2]/a/text()').extract(),
                    'Position': tr.xpath('./td[1]/text()').extract(),
                    'FD Pts': tr.xpath('./td[3]/text()').extract(),
                    'FD Salary': tr.xpath('./td[4]/text()').extract(),
                    'Team': tr.xpath('./td[5]/text()').extract(),
                    'Opp': tr.xpath('./td[6]/text()').extract(),
                    'Score': tr.xpath('./td[7]/text()').extract(),
                    'Min': tr.xpath('./td[8]/text()').extract(),
                    'Stats': tr.xpath('./td[9]/text()').extract()
                }
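
One thing to keep in mind: extract() always returns a list of matches, which is why every value in the DataFrame at the end of this notebook appears wrapped in brackets. If you prefer scalars, get() (extract_first() in older Scrapy releases) returns the first match or None. A sketch of the alternative for two fields, as a drop-in change inside the yield above; note it would change the shape of the JSON output shown later:

                yield {
                    # get() returns a scalar (or None) instead of a one-element list
                    'Name': tr.xpath('./td[2]/a/text()').get(),
                    'Position': tr.xpath('./td[1]/text()').get(),
                    # ... remaining fields unchanged, with extract() swapped for get()
                }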

Follow the link to the next page

# Create a new spider class
class RotoSpider(scrapy.Spider):

    # ...

    # Parse the response with XPath
    def parse(self, response):

        # ...
        
        # ...

        # Select the table containing the next page link
        table = response.xpath('//table[@border=0]')[6]

        # Get the next page link
        next_page = table.xpath('./tr[1]/td[1]/a/@href').extract_first()

        # Schedule a request for the next page, reusing parse as the callback
        if next_page is not None:
            print('Page completed. Going to next page.')
            yield scrapy.Request(next_page, callback=self.parse)
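
Passing the extracted href straight to scrapy.Request assumes the link is absolute (the crawl output below suggests it is). A more defensive version of this last step would use response.follow(), which resolves relative hrefs against the current page's URL:

        # response.follow resolves relative hrefs automatically
        if next_page is not None:
            print('Page completed. Going to next page.')
            yield response.follow(next_page, callback=self.parse)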

Pass in desired settings to the crawler process

# Create a new spider class
class RotoSpider(scrapy.Spider):

    # ...

    # Parse the response with XPath
    def parse(self, response):

        # ...
        
        # ...

        # ...

# Pass in settings
process = CrawlerProcess({
    'FEED_FORMAT': 'json',                 # Save our data as json
    'FEED_URI': 'nba_fanduel_stats.json',  # Specify the json output file
    'DEPTH_LIMIT': 3,                      # Only traverse three links
    'DOWNLOAD_DELAY': 0.50,                # Set a delay of 0.5 seconds
    'LOG_ENABLED': False                   # For debugging, change this to True
})
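
A compatibility note: FEED_FORMAT and FEED_URI were deprecated in Scrapy 2.1 in favor of the consolidated FEEDS setting. On a recent Scrapy release, the equivalent configuration would be:

# Equivalent settings for Scrapy 2.1 and newer
process = CrawlerProcess({
    'FEEDS': {
        'nba_fanduel_stats.json': {'format': 'json'},  # One output file with its format
    },
    'DEPTH_LIMIT': 3,
    'DOWNLOAD_DELAY': 0.50,
    'LOG_ENABLED': False
})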

Our complete Scrapy crawler

import pandas as pd
import scrapy
from scrapy.crawler import CrawlerProcess

# Create a new spider class
class RotoSpider(scrapy.Spider):
    # Name our spider
    name = "RS"
    
    # URL(s) to start with.
    start_urls = [
        'http://rotoguru1.com/cgi-bin/hyday.pl?mon=4&day=10&year=2019&game=fd'
    ]
    
    # Use XPath to parse the response we get.
    def parse(self, response):
        # Get the table rows
        trs = response.xpath('/html/body/table[1]//table[@cellspacing=5]//tr')
            
        # Remove the first two rows, since they don't contain any player data
        trs = trs[2:]   
        
        # Check that rows exist, then iterate over each one
        if trs:
            for tr in trs:
                # Yield a dictionary with the values we want
                yield {
                    'Name': tr.xpath('./td[2]/a/text()').extract(),
                    'Position': tr.xpath('./td[1]/text()').extract(),
                    'FD Pts': tr.xpath('./td[3]/text()').extract(),
                    'FD Salary': tr.xpath('./td[4]/text()').extract(),
                    'Team': tr.xpath('./td[5]/text()').extract(),
                    'Opp': tr.xpath('./td[6]/text()').extract(),
                    'Score': tr.xpath('./td[7]/text()').extract(),
                    'Min': tr.xpath('./td[8]/text()').extract(),
                    'Stats': tr.xpath('./td[9]/text()').extract()
                }
                
        # Select the table containing the next page link
        table = response.xpath('//table[@border=0]')[6]
        
        # Get the next page link
        next_page = table.xpath('./tr[1]/td[1]/a/@href').extract_first()
        
        # Schedule a request for the next page, reusing parse as the callback
        if next_page is not None:
            print('Page completed. Going to next page.')
            yield scrapy.Request(next_page, callback=self.parse)
        
# Pass in settings
process = CrawlerProcess({
    'FEED_FORMAT': 'json',                 # Save our data as json
    'FEED_URI': 'nba_fanduel_stats.json',  # Specify the json output file
    'DEPTH_LIMIT': 3,                      # Only traverse three links
    'DOWNLOAD_DELAY': 0.50,                # Set a delay of 0.5 seconds
    'LOG_ENABLED': False                   # For debugging, change this to True
})

Run the crawler

# Start the crawler
process.crawl(RotoSpider)
process.start()
print('Scraping completed.')
Page completed. Going to next page.
Page completed. Going to next page.
Page completed. Going to next page.
Page completed. Going to next page.
Scraping completed.
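
Two caveats when running this from a notebook: process.start() blocks until the crawl finishes, and the underlying Twisted reactor cannot be restarted, so calling process.start() a second time in the same interpreter raises an error; restart the kernel between runs. Older Scrapy releases also append feed output to an existing file, which would corrupt the JSON on a rerun, so it is safest to remove the old file first:

import os

# Delete any previous output so the exporter starts from a clean file
if os.path.exists('nba_fanduel_stats.json'):
    os.remove('nba_fanduel_stats.json')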

Import the JSON data into a new pandas DataFrame

nba_fanduel_stats = pd.read_json('nba_fanduel_stats.json', orient='records')
print('Number of rows: {}'.format(nba_fanduel_stats.shape[0]))
print(nba_fanduel_stats.head())
Number of rows: 1189
   FD Pts  FD Salary      Min                  Name      Opp Position  \
0  [58.7]   [$3,500]  [48:00]    [Simons, Anfernee]  [v sac]     [SG]   
1  [57.4]   [$3,500]  [40:43]      [Allen, Grayson]  [@ lac]     [SG]   
2  [56.9]   [$9,800]  [40:09]       [Walker, Kemba]  [v orl]     [PG]   
3  [55.7]   [$3,600]  [48:00]        [Frazier, Tim]  [v okc]     [PG]   
4  [55.7]  [$12,100]  [36:28]  [Westbrook, Russell]  [@ mil]     [PG]   

        Score                                              Stats   Team  
0  [ 136-131]      [   37pt 6rb 9as 1st 2to 7trey 13-21fg 4-6ft]  [por]  
1  [ 137-143]  [   40pt 7rb 4as 1st 1bl 3to 5trey 11-30fg 13-...  [uta]  
2  [ 114-122]      [   43pt 2rb 5as 2bl 2to 4trey 16-25fg 7-7ft]  [cha]  
3  [ 116-127]     [   29pt 6rb 13as 1bl 3to 4trey 10-23fg 5-7ft]  [mil]  
4  [ 127-116]  [   15pt 11rb 17as 1st 1bl 4to 1trey 7-10fg 0-...  [okc]
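
Every cell holds a one-element list (a side effect of extract()), so a natural next step is to flatten the columns to scalars before any analysis. A sketch, assuming the salary strings keep the '$3,500' format shown above:

# Unwrap one-element lists; leave empty matches as None
nba_fanduel_stats = nba_fanduel_stats.applymap(
    lambda v: v[0] if isinstance(v, list) and v else None)

# Strip the currency formatting so salaries become numeric
nba_fanduel_stats['FD Salary'] = (nba_fanduel_stats['FD Salary']
                                  .str.replace('[$,]', '', regex=True)
                                  .astype(float))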