Get Data from HTML Tabular Data.

Get Data from HTML Tabular Data

Given a collection of URLs, scrape the data from specific tables.

Some tables have thead and tbody tags. Others only have rows.

In [1]:
from pathlib import Path
import urllib.request
import urllib.parse
import json
from collections import namedtuple
import pickle
from uuid import uuid4
import itertools as it
import operator as op
import tempfile
from subprocess import check_call
import shlex
from functools import partial
from pprint import pprint
from string import whitespace, punctuation

import pandas as pd
from lxml import etree
from PIL import Image as PILImage
from chamelboots.imageutils import get_scaled_screenshot
from IPython.display import display, Image, HTML
from chamelboots import ChameleonTemplate as CT
from chamelboots import TalStatement as TS
from chamelboots.html.datautils.tables import get_text_from_tr_rows
from chamelboots.constants import Join as JoinWith
In [2]:
head, tail = (op.itemgetter(n) for n in range(2))

Load previously scraped data that has some errors in it.

In [3]:
SportCategory = namedtuple("SportCategory", "sex sport url")
data_page_keys = "category url fp"
DataPage = namedtuple("DataPage", data_page_keys)
DataTree = namedtuple("DataTree", f"{data_page_keys} root tree")
In [4]:
CONTENT = "content"
ATTRIBUTES = "attributes"
TAL_CONTENTS, TAL_ATTRIBUTES = TAL_CONTENTS_WITH_ATTRIBUTES = (
    TS(CONTENT, CONTENT),
    TS(ATTRIBUTES, ATTRIBUTES),
)
TAL_CONTENTS_WITH_STRUCTURE_ATTRIBUTES = (
    TS(CONTENT, f"structure {CONTENT}"),
    TS(ATTRIBUTES, ATTRIBUTES),
)
HTML_PARSER = etree.HTMLParser()
In [5]:
data_pages_pickle = Path("iowa_data_pages.pk")
# data_pages_pickle.unlink()
In [6]:
data_pages = (
    pickle.loads(data_pages_pickle.read_bytes())
    if data_pages_pickle.exists()
    else (DataPage(SportCategory(None, None, None), "", Path(uuid4().hex)),)
)

# The pickled data has tmp files in it. If they have been erased, it reloads the files.
if not all(item.fp.exists() for item in data_pages):
    BASEURL = Path(Path.home(), ".texpander/iowa_sports_stats_json").read_text().strip()

    (data,) = (
        json.loads(Path(f).read_text())
        for f, _ in (urllib.request.urlretrieve(BASEURL),)
    )
    df = pd.DataFrame(data)
    del data
    urls = tuple(set(df.url))
    data_pages = tuple(
        DataPage(
            [
                SportCategory(*(*item[1:][::-1], url))
                for item in df[df["url"] == url]
                .loc[:, "sport":"sex"]
                .drop_duplicates()
                .itertuples()
            ][0],
            url,
            Path(fp),
        )
        for url, (fp, _) in ((url, urllib.request.urlretrieve(url),) for url in urls)
    )
    data_pages_pickle.write_bytes(pickle.dumps(data_pages))
In [7]:
data_pages[:2]
Out[7]:
(DataPage(category=SportCategory(sex='girls', sport='Golf', url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688'), url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688', fp=PosixPath('/tmp/tmpf929_2v6')),
 DataPage(category=SportCategory(sex='boys', sport='Football: IndividualBestGameStandings', url='http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC'), url='http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC', fp=PosixPath('/tmp/tmpgdi5dux9')))
In [8]:
HTML_PARSER = etree.HTMLParser()
In [9]:
data_trees = tuple(
    DataTree(
        *items,
        *(
            (tree := etree.fromstring(items.fp.read_text(), HTML_PARSER)),
            tree.getroottree(),
        )[::-1],
    )
    for items in data_pages
)
In [10]:
data_trees[:2]
Out[10]:
(DataTree(category=SportCategory(sex='girls', sport='Golf', url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688'), url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688', fp=PosixPath('/tmp/tmpf929_2v6'), root=<lxml.etree._ElementTree object at 0x7f5658f1a230>, tree=<Element html at 0x7f5684106b40>),
 DataTree(category=SportCategory(sex='boys', sport='Football: IndividualBestGameStandings', url='http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC'), url='http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC', fp=PosixPath('/tmp/tmpgdi5dux9'), root=<lxml.etree._ElementTree object at 0x7f5658f1acd0>, tree=<Element html at 0x7f5658f1af50>))

Check for tables in trees.

In [11]:
tables = {
    item.category: dict(
        zip(
            *(
                (tables := item.tree.xpath("//table")),
                tuple(item.root.getpath(table) for table in tables),
            )[
                ::-1
            ]  # reverse so paths are keys
        )
    )
    for item in data_trees
}
In [12]:
TABLE_XPATH = "/html/body/form/div/div[3]/div[3]/div[2]/table"

Tables with defined headers

In [13]:
with_thead = {
    k: heads[0] for k, v in tables.items() if (heads := v[TABLE_XPATH].xpath("//thead"))
}

Tables without defined headers

In [14]:
no_thead = {
    k: table
    for k, v in tables.items()
    if not (table := v[TABLE_XPATH]).xpath("//thead")
}
In [15]:
TAL_CONTENTS, TAL_ATTRIBUTES = TAL_CONTENTS_WITH_ATTRIBUTES = (
    TS(CONTENT := "content", CONTENT),
    TS(ATTRIBUTES := "attributes", ATTRIBUTES),
)
In [16]:
H2 = CT("h2", (TAL_CONTENTS,)).render
HR = CT("hr", (TAL_ATTRIBUTES,)).render(
    attributes=dict(style="border: .3rem solid green; border-radius: 5px;")
)

COLOREDH3 = partial(  # create a red h2 HTML
    CT("h3", TAL_CONTENTS_WITH_ATTRIBUTES).render,
    attributes={"style": "color: green;"},
)
LINK = CT("a", TAL_CONTENTS_WITH_STRUCTURE_ATTRIBUTES).render

Display menu for jumping to a sport

In [17]:
ALL_TABLES = (*no_thead.items(), *with_thead.items())
In [18]:
PREFIX = "in-list-"
sports_links, to_top_links = zip(
    *(
        (
            LINK(
                attributes=dict(
                    id=(
                        id_ := "".join(
                            (
                                urllib.parse.parse_qs(
                                    urllib.parse.urlsplit(sport_category.url).query
                                )["IDSport"][0],
                                sport_category.sex,
                                "".join(sport_category.sport.split()).replace(":", ""),
                            )
                        )
                    ),
                    href=f"#{PREFIX}{id_}",
                ),
                content=": ".join(sport_category[:2]),
            ),
            LINK(
                attributes=dict(href=f"#{id_}", id=f"{PREFIX}{id_}",),
                content=H2(content="Back to Menu"),
            ),
        )
        for sport_category, table in ALL_TABLES
    )
)

Display a screenshot of the scraped site to see if the headers match.

Display header texts

In [20]:
for top_link, (sport_category, rows) in zip(
    to_top_links,
    it.islice(
        (
            (sport_category, get_text_from_tr_rows(table, (1, 2)),)
            for sport_category, table in ALL_TABLES
        ),
        0,
        len(ALL_TABLES),
    ),
):

    for item in (
        HTML(top_link),
        sport_category,
        HTML(
            LINK(
                attributes=dict(href=sport_category.url, target="_blank"),
                content=H2(content=sport_category.sport),
            )
        ),
        Image(filename=get_scaled_screenshot(sport_category.url)),
        HTML(COLOREDH3(content=head(rows))),
        HTML(HR),
    ):
        display(item)
SportCategory(sex='girls', sport='Golf', url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688')

['Team', '9 Hole Matches', '9 Hole Avg Top 4 Players Total', '18 Hole Matches', '18 Hole Avg Top 4 Players Total', 'Combined (9 & 18) Adjusted Avg Top 4 Players Total Adjusted by Course Rating']


SportCategory(sex='boys', sport='Football: IndividualBestGameStandings', url='http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC')

['Athlete', 'Team', 'Passing Yards', 'Date', 'Location', 'Opponent']


SportCategory(sex='boys', sport='Spring Golf', url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=FC614ADE-B5DA-4012-A95E-0FD2A594FE9D')

['Team', '9 Hole Matches', '9 Hole Avg Top 4 Players Total', '18 Hole Matches', '18 Hole Avg Top 4 Players Total', 'Combined (9 & 18) Adjusted Avg Top 4 Players Total Adjusted by Course Rating']


SportCategory(sex='girls', sport='Volleyball', url='http://quikstatsiowa.com/Public/Volleyball/TeamStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773')

['Team', 'Record', 'Sets', 'Kills', 'Kill Err', 'Attack Attempts', 'Kill Eff', 'Kills per Set', 'Serve Success', 'Serves', 'Serve Eff', 'Aces', 'Aces per Set']


SportCategory(sex='girls', sport='Tennis: IndividualStandings', url='http://quikstatsiowa.com/Public/Tennis/IndividualStandings.aspx?IDSport=6086C2DF-4661-4701-BFF1-3BB32C081B88')

['Athlete', 'Team', 'Match Record', 'Set Record', 'Game Record']


SportCategory(sex='boys', sport='Soccer: IndividualStandings', url='http://quikstatsiowa.com/Public/Soccer/IndividualStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B')

['Athlete', 'Team', 'GP', 'GS', 'G', 'A', 'P', 'Sh', 'Sh %', 'SOG', 'SOG %', 'PKM', 'PKA', 'GM', 'GA', 'GA Avg', 'S', 'S %']


SportCategory(sex='boys', sport='Tennis', url='http://quikstatsiowa.com/Public/Tennis/TeamStandings.aspx?IDSport=19786FF3-ADA3-4C7A-A94F-FAC0811118F5')

['Team', 'Record', 'Singles Won', 'Singles Lost', 'Doubles Won', 'Doubles Lost']


SportCategory(sex='girls', sport='Bowling', url='http://quikstatsiowa.com/Public/Bowling/TeamStandings.aspx?IDSport=0C6DFBCF-98C4-4B01-9F56-17B02E9E47E1')

['Team', 'Record', 'Matches', 'Match Score Avg', 'Individual Game Average', 'Baker Game Average']


SportCategory(sex='boys', sport='Bowling', url='http://quikstatsiowa.com/Public/Bowling/TeamStandings.aspx?IDSport=DA3506E8-E4CA-4175-BF69-BEBBDC2FD878')

['Team', 'Record', 'Matches', 'Match Score Avg', 'Individual Game Average', 'Baker Game Average']


SportCategory(sex='boys', sport='Fall Golf', url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=92A34DE4-ACB3-4282-BF29-571A97DE1946')

['Team', '9 Hole Matches', '9 Hole Avg Top 4 Players Total', '18 Hole Matches', '18 Hole Avg Top 4 Players Total', 'Combined (9 & 18) Adjusted Avg Top 4 Players Total Adjusted by Course Rating']


SportCategory(sex='girls', sport='Basketball', url='http://quikstatsiowa.com/Public/Basketball/TeamStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA')