Get Data from HTML Tabular Data¶
Given a collection of URLs, scrape the data from specific tables. Some tables have thead and tbody tags; others only have rows.
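The get_text_from_tr_rows helper imported in the next cell copes with both layouts. As a minimal sketch of the underlying idea in plain lxml (not the actual chamelboots implementation, which also takes a row-selection argument):

from lxml import etree

SAMPLE = (
    "<table><thead><tr><th>Team</th></tr></thead>"
    "<tbody><tr><td>Ames</td></tr></tbody></table>"
)

def rows_text(table):
    # ".//tr" matches rows at any depth, with or without thead/tbody wrappers
    return [
        ["".join(cell.itertext()).strip() for cell in tr.xpath("./th|./td")]
        for tr in table.xpath(".//tr")
    ]

table = etree.fromstring(SAMPLE, etree.HTMLParser()).xpath("//table")[0]
rows_text(table)  # [['Team'], ['Ames']]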
In [1]:
from pathlib import Path
import urllib.request
import urllib.parse
import json
from collections import namedtuple
import pickle
from uuid import uuid4
import itertools as it
import operator as op
import tempfile
from subprocess import check_call
import shlex
from functools import partial
from pprint import pprint
from string import whitespace, punctuation
import pandas as pd
from lxml import etree
from PIL import Image as PILImage
from chamelboots.imageutils import get_scaled_screenshot
from IPython.display import display, Image, HTML
from chamelboots import ChameleonTemplate as CT
from chamelboots import TalStatement as TS
from chamelboots.html.datautils.tables import get_text_from_tr_rows
from chamelboots.constants import Join as JoinWith
In [2]:
head, tail = (op.itemgetter(n) for n in range(2))  # head(seq) == seq[0], tail(seq) == seq[1]
Load previously scraped data that has some errors in it.¶
In [3]:
SportCategory = namedtuple("SportCategory", "sex sport url")
data_page_keys = "category url fp"
DataPage = namedtuple("DataPage", data_page_keys)
DataTree = namedtuple("DataTree", f"{data_page_keys} root tree")
In [4]:
CONTENT = "content"
ATTRIBUTES = "attributes"
TAL_CONTENTS, TAL_ATTRIBUTES = TAL_CONTENTS_WITH_ATTRIBUTES = (
    TS(CONTENT, CONTENT),
    TS(ATTRIBUTES, ATTRIBUTES),
)
TAL_CONTENTS_WITH_STRUCTURE_ATTRIBUTES = (
    TS(CONTENT, f"structure {CONTENT}"),
    TS(ATTRIBUTES, ATTRIBUTES),
)
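These TS pairs appear to map onto Chameleon's tal: attributes; the difference between plain content and "structure content" is escaping. A hedged illustration in raw Chameleon:

from chameleon import PageTemplate

escaped = PageTemplate('<h2 tal:content="content">x</h2>')
raw = PageTemplate('<h2 tal:content="structure content">x</h2>')
escaped.render(content="<em>hi</em>")  # '<h2>&lt;em&gt;hi&lt;/em&gt;</h2>'
raw.render(content="<em>hi</em>")  # '<h2><em>hi</em></h2>'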
HTML_PARSER = etree.HTMLParser()
In [5]:
data_pages_pickle = Path("iowa_data_pages.pk")
# Uncomment to discard the cached pages and force a fresh scrape:
# data_pages_pickle.unlink()
In [6]:
data_pages = (
    pickle.loads(data_pages_pickle.read_bytes())
    if data_pages_pickle.exists()
    # sentinel DataPage whose fp never exists, forcing the re-scrape below
    else (DataPage(SportCategory(None, None, None), "", Path(uuid4().hex)),)
)
# The pickled DataPages point at temp files created by urlretrieve; if those
# files have since been cleaned up, re-download and rebuild everything.
if not all(item.fp.exists() for item in data_pages):
    BASEURL = Path(Path.home(), ".texpander/iowa_sports_stats_json").read_text().strip()
    # urlretrieve returns (filename, headers); parse the JSON index it fetched.
    (data,) = (
        json.loads(Path(f).read_text())
        for f, _ in (urllib.request.urlretrieve(BASEURL),)
    )
    df = pd.DataFrame(data)
    del data
    urls = tuple(set(df.url))
    data_pages = tuple(
        DataPage(
            [
                # item is (Index, sport, sex); reversed and joined with url -> (sex, sport, url)
                SportCategory(*(*item[1:][::-1], url))
                for item in df[df["url"] == url]
                .loc[:, "sport":"sex"]
                .drop_duplicates()
                .itertuples()
            ][0],
            url,
            Path(fp),
        )
        for url, (fp, _) in ((url, urllib.request.urlretrieve(url)) for url in urls)
    )
    data_pages_pickle.write_bytes(pickle.dumps(data_pages))
In [7]:
data_pages[:2]
Out[7]:
In [9]:
data_trees = tuple(
    DataTree(
        *items,
        *(
            (tree := etree.fromstring(items.fp.read_text(), HTML_PARSER)),
            tree.getroottree(),
        )[::-1],  # reversed so `root` gets the ElementTree and `tree` the root element
    )
    for items in data_pages
)
In [10]:
data_trees[:2]
Out[10]:
Check for tables in trees.¶
In [11]:
tables = {
    item.category: dict(
        zip(
            *(
                (tables := item.tree.xpath("//table")),
                tuple(item.root.getpath(table) for table in tables),
            )[::-1]  # reverse so paths are keys and tables are values
        )
    )
    for item in data_trees
}
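Each category now maps its tables by absolute XPath. lxml's ElementTree.getpath is what produces those keys; it returns an unambiguous path for any element in the tree:

doc = etree.fromstring(
    "<html><body><table></table><div><table></table></div></body></html>", HTML_PARSER
)
[doc.getroottree().getpath(t) for t in doc.xpath("//table")]
# ['/html/body/table', '/html/body/div/table']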
In [12]:
# Absolute XPath of the stats table shared by all of the scraped pages.
TABLE_XPATH = "/html/body/form/div/div[3]/div[3]/div[2]/table"
Tables with defined headers¶
In [13]:
with_thead = {
    # ".//thead" is relative to the table; see the demonstration below.
    k: heads[0]
    for k, v in tables.items()
    if (heads := v[TABLE_XPATH].xpath(".//thead"))
}
Tables without defined headers¶
In [14]:
no_thead = {
    k: table
    for k, v in tables.items()
    if not (table := v[TABLE_XPATH]).xpath(".//thead")
}
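The leading ".//" in both cells matters: evaluated against an element, a bare "//thead" still searches the whole document, so every table would appear to have a header as soon as any table on the page did. A small demonstration:

demo = etree.fromstring(
    "<html><body><table id='a'><thead></thead></table>"
    "<table id='b'></table></body></html>",
    HTML_PARSER,
)
b = demo.xpath("//table[@id='b']")[0]
bool(b.xpath("//thead")), bool(b.xpath(".//thead"))  # (True, False)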
In [16]:
H2 = CT("h2", (TAL_CONTENTS,)).render
HR = CT("hr", (TAL_ATTRIBUTES,)).render(
    attributes=dict(style="border: .3rem solid green; border-radius: 5px;")
)
COLOREDH3 = partial(  # create a green h3
    CT("h3", TAL_CONTENTS_WITH_ATTRIBUTES).render,
    attributes={"style": "color: green;"},
)
LINK = CT("a", TAL_CONTENTS_WITH_STRUCTURE_ATTRIBUTES).render
Display menu for jumping to a sport¶
In [17]:
ALL_TABLES = (*no_thead.items(), *with_thead.items())
In [18]:
PREFIX = "in-list-"
sports_links, to_top_links = zip(
    *(
        (
            LINK(
                attributes=dict(
                    id=(
                        id_ := "".join(
                            (
                                urllib.parse.parse_qs(
                                    urllib.parse.urlsplit(sport_category.url).query
                                )["IDSport"][0],
                                sport_category.sex,
                                "".join(sport_category.sport.split()).replace(":", ""),
                            )
                        )
                    ),
                    href=f"#{PREFIX}{id_}",
                ),
                content=": ".join(sport_category[:2]),
            ),
            LINK(
                attributes=dict(href=f"#{id_}", id=f"{PREFIX}{id_}"),
                content=H2(content="Back to Menu"),
            ),
        )
        for sport_category, table in ALL_TABLES
    )
)
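Each anchor id splices the IDSport query parameter together with the sex and the whitespace-stripped sport name. A quick check of the query parsing, using a made-up URL:

url = "https://example.com/stats.aspx?IDSport=12"  # hypothetical URL for illustration
urllib.parse.parse_qs(urllib.parse.urlsplit(url).query)["IDSport"][0]  # '12'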
In [19]:
display(
    HTML(
        CT("ul", (TAL_SC := TS(CONTENT, f"structure {CONTENT}"),)).render(
            content=CT("li", (TS("repeat", "content items"), TAL_SC)).render(
                items=sports_links
            )
        )
    )
)
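Assuming CT and TS are thin wrappers over Chameleon (as the imports suggest), the nested menu template corresponds roughly to this raw Chameleon sketch, where tal:repeat stamps out one li per link and "structure" keeps the pre-rendered anchors unescaped:

from chameleon import PageTemplate

menu = PageTemplate(
    '<ul><li tal:repeat="content items" tal:content="structure content"></li></ul>'
)
menu.render(items=["<a href='#x'>one</a>", "<a href='#y'>two</a>"])
# "<ul><li><a href='#x'>one</a></li><li><a href='#y'>two</a></li></ul>"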
Display a screenshot of the scraped site to see if the headers match.¶
Display header texts¶
In [20]:
for top_link, (sport_category, rows) in zip(
    to_top_links,
    it.islice(  # islice(..., 0, len(ALL_TABLES)) is a no-op; shrink the bounds to preview a subset
        (
            (sport_category, get_text_from_tr_rows(table, (1, 2)))
            for sport_category, table in ALL_TABLES
        ),
        0,
        len(ALL_TABLES),
    ),
):
    for item in (
        HTML(top_link),
        sport_category,
        HTML(
            LINK(
                attributes=dict(href=sport_category.url, target="_blank"),
                content=H2(content=sport_category.sport),
            )
        ),
        Image(filename=get_scaled_screenshot(sport_category.url)),
        HTML(COLOREDH3(content=head(rows))),
        HTML(HR),
    ):
        display(item)