Iterate through rows of HTML tables.

Iterate through rows of HTML tables.

In [19]:
import urllib.request
from pathlib import Path
from pprint import pprint
import itertools as it
import operator as op
import json
from functools import partial
from collections import namedtuple
In [51]:
from lxml import etree
import pandas as pd
from IPython.display import display, HTML
from chamelboots import ChameleonTemplate as CT
from chamelboots import TalStatement as TS
In [3]:
# Names of the two TAL statement kinds used by the heading templates below.
CONTENT = "content"
ATTRIBUTES = "attributes"

# One TalStatement per kind; each is built with the same string for both
# arguments (presumably statement name and template variable -- confirm
# against the chamelboots documentation).
TAL_CONTENTS = TS(CONTENT, CONTENT)
TAL_ATTRIBUTES = TS(ATTRIBUTES, ATTRIBUTES)
TAL_CONTENTS_WITH_ATTRIBUTES = (TAL_CONTENTS, TAL_ATTRIBUTES)

# A single lxml HTML parser instance shared by every parse in the notebook.
HTML_PARSER = etree.HTMLParser()

Load the contents at URL into tempfile.

The data was scraped from this sports website using a Scrapy script.

While scraping the website, I discovered that the HTML tables had different shapes: some had a head and a body, and some had just rows. I managed to put together some comprehensions that handled both cases, but they are difficult to read. So here I am going to iterate through all the tables on all the pages to see whether a single function can be defined that handles every table.

In [4]:
# The URL of the scraped JSON is read from a file under the user's home
# directory rather than being written into the notebook.
BASEURL = (Path.home() / ".texpander/iowa_sports_stats_json").read_text().strip()

Get Iowa sports data

In [5]:
# urlretrieve() returns (local_filename, headers); only the filename is kept.
filepath = Path(urllib.request.urlretrieve(BASEURL)[0])

Sort and group the list of dictionaries by sex and sport.

In [6]:
# Records are ordered and grouped on the same (sex, sport) key; groupby only
# merges adjacent items, hence the sort first.
SORT_KEY = op.itemgetter("sex", "sport")
TABLE_SIZE = 5  # number of rows kept from each group

records = sorted(json.loads(filepath.read_text()), key=SORT_KEY)
groups = it.groupby(records, key=SORT_KEY)

# One DataFrame per (sex, sport) group, truncated to its first TABLE_SIZE rows.
dfs = [pd.DataFrame(list(group)).head(TABLE_SIZE) for _, group in groups]
In [7]:
# Take the first group's DataFrame by index. The previous `df_table, *_ = dfs`
# form bound `_` in the notebook namespace, which shadows IPython's
# last-output variable and keeps every other DataFrame alive under that name.
df_table = dfs[0]
display(df_table)
sport sex url Team Record GP AB R H 2B ... RBI SAC BB SO SB SBA HBP OBP SLG AVG
0 Baseball boys http://quikstatsiowa.com/Public/BBSB/TeamStand... Xavier 41 - 2 - 0 43 1120 371 391 79 ... 306 50 212 162 92 112 53 0.466 0.485 0.349
1 Baseball boys http://quikstatsiowa.com/Public/BBSB/TeamStand... West Lyon 27 - 2 - 0 29 732 298 254 49 ... 240 37 196 157 141 151 45 0.509 0.485 0.347
2 Baseball boys http://quikstatsiowa.com/Public/BBSB/TeamStand... Newman Catholic 38 - 3 - 0 41 1055 429 382 90 ... 346 0 257 139 101 103 88 0.519 0.586 0.362
3 Baseball boys http://quikstatsiowa.com/Public/BBSB/TeamStand... Van Meter 34 - 3 - 0 37 1000 336 340 67 ... 287 43 153 146 125 143 38 0.446 0.454 0.340
4 Baseball boys http://quikstatsiowa.com/Public/BBSB/TeamStand... Central DeWitt 38 - 4 - 0 42 1222 309 408 92 ... 261 20 161 227 125 156 41 0.425 0.470 0.334

5 rows × 22 columns

In [8]:
# A row of twenty asterisks.
LINE = "*" * 20

Store the element trees in a list to avoid repeated network calls.

In [30]:
# Identifies one scraped page: position of its DataFrame in `dfs`, the page
# URL, and the sex / sport labels of that DataFrame.
DF_Row = namedtuple("DF_Row", ["Index", "url", "sex", "sport"])
In [31]:
def _fetch_html_tree(url):
    """Download ``url`` to a temporary file and return its parsed HTML root.

    The payload is handed to lxml as bytes, so decoding follows what the
    parser detects in the document instead of the locale default encoding
    that ``Path.read_text()`` would apply.
    """
    downloaded_path, _headers = urllib.request.urlretrieve(url)
    return etree.fromstring(Path(downloaded_path).read_bytes(), HTML_PARSER)


# One (DF_Row, root element) pair per distinct URL in each DataFrame.
# `first_row` is bound through a one-element loop so that nothing leaks into
# the notebook namespace (the earlier walrus form left a stray `df_row`).
element_trees = [
    (DF_Row(index, url, first_row.sex, first_row.sport), _fetch_html_tree(url))
    for index, frame in enumerate(dfs)
    for first_row in (frame.iloc[0],)
    for url in frame["url"].drop_duplicates()
]
In [38]:
# Preview every 8th entry of the first 40 so that both boys' and girls'
# pages show up in the sample.
element_trees[0:40:8]
Out[38]:
[(DF_Row(Index=0, url='http://quikstatsiowa.com/Public/BBSB/TeamStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD', sex='boys', sport='Baseball'),
  <Element html at 0x7efe16f8ac80>),
 (DF_Row(Index=8, url='http://quikstatsiowa.com/Public/Football/TeamStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC', sex='boys', sport='Football'),
  <Element html at 0x7efe170eaf50>),
 (DF_Row(Index=16, url='http://quikstatsiowa.com/Public/Tennis/TeamStandings.aspx?IDSport=19786FF3-ADA3-4C7A-A94F-FAC0811118F5', sex='boys', sport='Tennis'),
  <Element html at 0x7efe170dc0a0>),
 (DF_Row(Index=24, url='http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688', sex='girls', sport='Golf: IndividualStandings'),
  <Element html at 0x7efe1820ef00>),
 (DF_Row(Index=32, url='http://quikstatsiowa.com/Public/Tennis/IndividualStandings.aspx?IDSport=6086C2DF-4661-4701-BFF1-3BB32C081B88', sex='girls', sport='Tennis: IndividualStandings'),
  <Element html at 0x7efe170e2e10>)]
In [39]:
# The later cells pair element_trees with dfs positionally via zip(), which
# is only valid when each DataFrame contributed exactly one URL.
assert len(element_trees) == len(dfs), (
    f"{len(element_trees)} parsed pages for {len(dfs)} DataFrames"
)
In [12]:
# H2 is the bound render method of an <h2> template that takes `content`;
# HR is an <hr> template rendered once up front and reused.
_h2_template = CT("h2", (TAL_CONTENTS,))
H2 = _h2_template.render
HR = CT("hr").render()
In [13]:
# Same <h2> template as H2 but also accepting attributes; the style attribute
# is pre-filled so callers only supply `content` to get a red heading.
_red_h2_template = CT("h2", TAL_CONTENTS_WITH_ATTRIBUTES)
REDH2 = partial(_red_h2_template.render, attributes={"style": "color: red;"})

Analyze all the tables in all of the HTML documents.

In [14]:
# Legend for the report below. The report uses the red heading whenever the
# table count differs from 2 (not only when it exceeds 2), so the wording
# says exactly that.
HTML(REDH2(content="If a page does not have exactly 2 tables, then its table count is printed in red."))
Out[14]:

If there are more than 2 tables, then a header is printed in red.

In [45]:
# NOTE: TABLE_XPATH is not read anywhere in this cell; the assignment is kept
# only so the notebook namespace is unchanged.
TABLE_XPATH = "//form/div/div[3]/div[3]/div[2]/table"


def _has_content(item):
    """Return True when ``item`` is worth displaying.

    pandas objects raise ``ValueError`` from ``bool()``, so anything exposing
    an ``empty`` attribute is judged by that flag alone; every other object is
    judged by ordinary truthiness. (Previously an empty DataFrame fell through
    to a plain truth test and raised.)
    """
    is_empty = getattr(item, "empty", None)
    if is_empty is None:
        return bool(item)
    return not is_empty


# For every page: show its sex/sport label, its DF_Row, how many <table>
# elements it contains, and the absolute XPath of each one.
for (df_row, tree), df_table in zip(element_trees, dfs):
    root = tree.getroottree()
    label = df_table[["sex", "sport"]].drop_duplicates()
    tables = tree.xpath("//table")
    table_length = len(tables)

    # A count of exactly two gets the plain heading; anything else is red.
    render_heading = H2 if table_length == 2 else REDH2
    count_heading = HTML(render_heading(content=f"table count: {table_length}"))

    # (item, heading) pairs; the heading, when present, is shown before the item.
    for item, text in (
        (label, None),
        (df_row, None),
        (tables, count_heading),
        (
            [root.getpath(table) for table in tables],
            HTML(H2(content="xpaths to tables")),
        ),
        (None, HTML(HR)),  # separator only, nothing to show after it
    ):
        if text is not None:
            display(text)
        if _has_content(item):
            display(item)
sex sport
0 boys Baseball
DF_Row(Index=0, url='http://quikstatsiowa.com/Public/BBSB/TeamStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD', sex='boys', sport='Baseball')

table count: 2

[<Element table at 0x7efe16f97d70>, <Element table at 0x7efe16f97780>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Baseball: IndividualBestGameStandings
DF_Row(Index=1, url='http://quikstatsiowa.com/Public/BBSB/IndividualBestGameStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD', sex='boys', sport='Baseball: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16f92f50>, <Element table at 0x7efe16f92c80>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Baseball: IndividualStandings
DF_Row(Index=2, url='http://quikstatsiowa.com/Public/BBSB/IndividualStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD', sex='boys', sport='Baseball: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f92280>, <Element table at 0x7efe16f92f00>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Basketball
DF_Row(Index=3, url='http://quikstatsiowa.com/Public/Basketball/TeamStandings.aspx?IDSport=57C38F60-B323-4087-A557-9ED925DC546D', sex='boys', sport='Basketball')

table count: 2

[<Element table at 0x7efe16f92d20>, <Element table at 0x7efe16f92f50>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Bowling
DF_Row(Index=4, url='http://quikstatsiowa.com/Public/Bowling/TeamStandings.aspx?IDSport=DA3506E8-E4CA-4175-BF69-BEBBDC2FD878', sex='boys', sport='Bowling')

table count: 2

[<Element table at 0x7efe16f46a50>, <Element table at 0x7efe16f46c80>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Bowling: IndividualStandings
DF_Row(Index=5, url='http://quikstatsiowa.com/Public/Bowling/IndividualStandings.aspx?IDSport=DA3506E8-E4CA-4175-BF69-BEBBDC2FD878', sex='boys', sport='Bowling: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f92d70>, <Element table at 0x7efe16f92d20>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Fall Golf
DF_Row(Index=6, url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=92A34DE4-ACB3-4282-BF29-571A97DE1946', sex='boys', sport='Fall Golf')

table count: 2

[<Element table at 0x7efe16fad4b0>, <Element table at 0x7efe16fb3870>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Fall Golf: IndividualStandings
DF_Row(Index=7, url='http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=92A34DE4-ACB3-4282-BF29-571A97DE1946', sex='boys', sport='Fall Golf: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f92050>, <Element table at 0x7efe16fa9fa0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Football
DF_Row(Index=8, url='http://quikstatsiowa.com/Public/Football/TeamStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC', sex='boys', sport='Football')

table count: 3

[<Element table at 0x7efe16f92d20>,
 <Element table at 0x7efe16fa9d70>,
 <Element table at 0x7efe16fa9be0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/div[10]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Football: IndividualBestGameStandings
DF_Row(Index=9, url='http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC', sex='boys', sport='Football: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16fa9e10>, <Element table at 0x7efe16fa9320>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Football: IndividualStandings
DF_Row(Index=10, url='http://quikstatsiowa.com/Public/Football/IndividualStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC', sex='boys', sport='Football: IndividualStandings')

table count: 3

[<Element table at 0x7efe16fa9960>,
 <Element table at 0x7efe16fa9370>,
 <Element table at 0x7efe16fa9f00>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/div[10]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Soccer
DF_Row(Index=11, url='http://quikstatsiowa.com/Public/Soccer/TeamStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B', sex='boys', sport='Soccer')

table count: 2

[<Element table at 0x7efe16fa9e10>, <Element table at 0x7efe16fa93c0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Soccer: IndividualBestGameStandings
DF_Row(Index=12, url='http://quikstatsiowa.com/Public/Soccer/IndividualBestGameStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B', sex='boys', sport='Soccer: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16f4b3c0>, <Element table at 0x7efe16f4b5a0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Soccer: IndividualStandings
DF_Row(Index=13, url='http://quikstatsiowa.com/Public/Soccer/IndividualStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B', sex='boys', sport='Soccer: IndividualStandings')

table count: 2

[<Element table at 0x7efe16fa9af0>, <Element table at 0x7efe16f4b500>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Spring Golf
DF_Row(Index=14, url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=FC614ADE-B5DA-4012-A95E-0FD2A594FE9D', sex='boys', sport='Spring Golf')

table count: 2

[<Element table at 0x7efe16fa9c30>, <Element table at 0x7efe16fa9aa0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Spring Golf: IndividualStandings
DF_Row(Index=15, url='http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=FC614ADE-B5DA-4012-A95E-0FD2A594FE9D', sex='boys', sport='Spring Golf: IndividualStandings')

table count: 2

[<Element table at 0x7efe16fa9e10>, <Element table at 0x7efe16fa9eb0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Tennis
DF_Row(Index=16, url='http://quikstatsiowa.com/Public/Tennis/TeamStandings.aspx?IDSport=19786FF3-ADA3-4C7A-A94F-FAC0811118F5', sex='boys', sport='Tennis')

table count: 2

[<Element table at 0x7efe16f4b640>, <Element table at 0x7efe16f4b4b0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 boys Tennis: IndividualStandings
DF_Row(Index=17, url='http://quikstatsiowa.com/Public/Tennis/IndividualStandings.aspx?IDSport=19786FF3-ADA3-4C7A-A94F-FAC0811118F5', sex='boys', sport='Tennis: IndividualStandings')

table count: 2

[<Element table at 0x7efe16fa9b40>, <Element table at 0x7efe16fa9a00>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Basketball
DF_Row(Index=18, url='http://quikstatsiowa.com/Public/Basketball/TeamStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA', sex='girls', sport='Basketball')

table count: 2

[<Element table at 0x7efe16fae3c0>, <Element table at 0x7efe16f4b960>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Basketball: IndividualBestGameStandings
DF_Row(Index=19, url='http://quikstatsiowa.com/Public/Basketball/IndividualBestGameStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA', sex='girls', sport='Basketball: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16fb60f0>, <Element table at 0x7efe16fb6fa0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Basketball: IndividualStandings
DF_Row(Index=20, url='http://quikstatsiowa.com/Public/Basketball/IndividualStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA', sex='girls', sport='Basketball: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f4b050>, <Element table at 0x7efe16f4b960>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Bowling
DF_Row(Index=21, url='http://quikstatsiowa.com/Public/Bowling/TeamStandings.aspx?IDSport=0C6DFBCF-98C4-4B01-9F56-17B02E9E47E1', sex='girls', sport='Bowling')

table count: 2

[<Element table at 0x7efe16f92690>, <Element table at 0x7efe16f92190>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Bowling: IndividualStandings
DF_Row(Index=22, url='http://quikstatsiowa.com/Public/Bowling/IndividualStandings.aspx?IDSport=0C6DFBCF-98C4-4B01-9F56-17B02E9E47E1', sex='girls', sport='Bowling: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f92d20>, <Element table at 0x7efe16f92e10>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Golf
DF_Row(Index=23, url='http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688', sex='girls', sport='Golf')

table count: 2

[<Element table at 0x7efe16f92a00>, <Element table at 0x7efe16f92f50>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Golf: IndividualStandings
DF_Row(Index=24, url='http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688', sex='girls', sport='Golf: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f4b140>, <Element table at 0x7efe16f4b960>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Soccer
DF_Row(Index=25, url='http://quikstatsiowa.com/Public/Soccer/TeamStandings.aspx?IDSport=65E5DA09-90C6-45F5-847A-F9A84FD9C5B0', sex='girls', sport='Soccer')

table count: 2

[<Element table at 0x7efe16f92cd0>, <Element table at 0x7efe16f92d70>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Soccer: IndividualBestGameStandings
DF_Row(Index=26, url='http://quikstatsiowa.com/Public/Soccer/IndividualBestGameStandings.aspx?IDSport=65E5DA09-90C6-45F5-847A-F9A84FD9C5B0', sex='girls', sport='Soccer: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16fa3410>, <Element table at 0x7efe16f92e10>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Soccer: IndividualStandings
DF_Row(Index=27, url='http://quikstatsiowa.com/Public/Soccer/IndividualStandings.aspx?IDSport=65E5DA09-90C6-45F5-847A-F9A84FD9C5B0', sex='girls', sport='Soccer: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f92690>, <Element table at 0x7efe170e59b0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Softball
DF_Row(Index=28, url='http://quikstatsiowa.com/Public/BBSB/TeamStandings.aspx?IDSport=D97DD7D0-0BEF-404A-B041-7E51ACFDBD16', sex='girls', sport='Softball')

table count: 2

[<Element table at 0x7efe170e8c30>, <Element table at 0x7efe16fb5550>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Softball: IndividualBestGameStandings
DF_Row(Index=29, url='http://quikstatsiowa.com/Public/BBSB/IndividualBestGameStandings.aspx?IDSport=D97DD7D0-0BEF-404A-B041-7E51ACFDBD16', sex='girls', sport='Softball: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16f92f50>, <Element table at 0x7efe16f92cd0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Softball: IndividualStandings
DF_Row(Index=30, url='http://quikstatsiowa.com/Public/BBSB/IndividualStandings.aspx?IDSport=D97DD7D0-0BEF-404A-B041-7E51ACFDBD16', sex='girls', sport='Softball: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f92960>, <Element table at 0x7efe16f92d70>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Tennis
DF_Row(Index=31, url='http://quikstatsiowa.com/Public/Tennis/TeamStandings.aspx?IDSport=6086C2DF-4661-4701-BFF1-3BB32C081B88', sex='girls', sport='Tennis')

table count: 2

[<Element table at 0x7efe16f92e10>, <Element table at 0x7efe16fa1870>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Tennis: IndividualStandings
DF_Row(Index=32, url='http://quikstatsiowa.com/Public/Tennis/IndividualStandings.aspx?IDSport=6086C2DF-4661-4701-BFF1-3BB32C081B88', sex='girls', sport='Tennis: IndividualStandings')

table count: 2

[<Element table at 0x7efe16f8bc30>, <Element table at 0x7efe16fa11e0>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Volleyball
DF_Row(Index=33, url='http://quikstatsiowa.com/Public/Volleyball/TeamStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773', sex='girls', sport='Volleyball')

table count: 2

[<Element table at 0x7efe16f92730>, <Element table at 0x7efe16f92280>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Volleyball: IndividualBestGameStandings
DF_Row(Index=34, url='http://quikstatsiowa.com/Public/Volleyball/IndividualBestGameStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773', sex='girls', sport='Volleyball: IndividualBestGameStandings')

table count: 2

[<Element table at 0x7efe16f92aa0>, <Element table at 0x7efe16fb1d20>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

sex sport
0 girls Volleyball: IndividualStandings
DF_Row(Index=35, url='http://quikstatsiowa.com/Public/Volleyball/IndividualStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773', sex='girls', sport='Volleyball: IndividualStandings')

table count: 2

[<Element table at 0x7efe16fb1a50>, <Element table at 0x7efe16fb1050>]

xpaths to tables

['/html/body/form/div/div[2]/table',
 '/html/body/form/div/div[3]/div[3]/div[2]/table']

Assert that all the HTML docs of interest have a table at the same xpath

In [46]:
TABLE_XPATH = "/html/body/form/div/div[3]/div[3]/div[2]/table"

# For each page, the absolute path of every element matched by TABLE_XPATH.
table_xpaths_per_page = [
    [root.getpath(table) for table in tree.xpath(TABLE_XPATH)]
    for _, tree in element_trees
    for root in (tree.getroottree(),)
]
display(table_xpaths_per_page[:5])  # display first 5

# Across all pages, the only path ever matched is TABLE_XPATH itself.
set_of_xpaths = set(it.chain.from_iterable(table_xpaths_per_page))
assert set_of_xpaths == {TABLE_XPATH}, set_of_xpaths

# The flattened set above cannot reveal a page with no match at all, so also
# require every individual page to contain exactly that one table.
pages_off_pattern = [
    df_row.url
    for (df_row, _), paths in zip(element_trees, table_xpaths_per_page)
    if paths != [TABLE_XPATH]
]
assert not pages_off_pattern, f"pages without a single table at TABLE_XPATH: {pages_off_pattern}"
[['/html/body/form/div/div[3]/div[3]/div[2]/table'],
 ['/html/body/form/div/div[3]/div[3]/div[2]/table'],
 ['/html/body/form/div/div[3]/div[3]/div[2]/table'],
 ['/html/body/form/div/div[3]/div[3]/div[2]/table'],
 ['/html/body/form/div/div[3]/div[3]/div[2]/table']]

Analyze tables at path TABLE_XPATH

In [59]:
def _stripped_texts(element):
    """Return the non-blank text fragments under ``element``, whitespace-stripped."""
    return [t for text in element.itertext() if (t := text.strip())]


# Print the column labels of the table at TABLE_XPATH on every page.
for df_row, tree in element_trees:
    headers = []
    for table in tree.xpath(TABLE_XPATH):
        # The leading "." keeps the search inside `table`. A bare "//thead" or
        # "//tr" is evaluated from the document root and therefore also picks
        # up rows of the other tables on the page.
        header_elements = table.xpath(".//thead") or table.xpath(".//tr")[:1]
        # NOTE(review): assumes that in a table without <thead> the first row
        # holds the column labels -- confirm against a Football page.
        headers.extend(_stripped_texts(element) for element in header_elements)
    display(HTML(H2(content=' '.join(df_row[-2:][::-1]))))
    print(df_row.url)
    print(headers)

Baseball boys

http://quikstatsiowa.com/Public/BBSB/TeamStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD
[['Team', 'Record', 'GP', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SAC', 'BB', 'SO', 'SB', 'SBA', 'HBP', 'OBP', 'SLG', 'AVG']]

Baseball: IndividualBestGameStandings boys

http://quikstatsiowa.com/Public/BBSB/IndividualBestGameStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD
[['Athlete', 'Team', 'Runs', 'Date', 'Location', 'Opponent']]

Baseball: IndividualStandings boys

http://quikstatsiowa.com/Public/BBSB/IndividualStandings.aspx?IDSport=B25923B5-D303-41CA-B9B3-DF2527D84CDD
[['Athlete', 'Team', 'GP', 'GS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SAC', 'BB', 'SO', 'SB', 'SBA', 'HBP', 'OBP', 'SLG', 'AVG']]

Basketball boys

http://quikstatsiowa.com/Public/Basketball/TeamStandings.aspx?IDSport=57C38F60-B323-4087-A557-9ED925DC546D
[['Team', 'Record', 'FG Att', 'FG Made', 'FG %', '3 pt Att', '3 pt Made', '3 pt %', 'FT Att', 'FT Made', 'FT %', 'Pts', 'Pts/G']]

Bowling boys

http://quikstatsiowa.com/Public/Bowling/TeamStandings.aspx?IDSport=DA3506E8-E4CA-4175-BF69-BEBBDC2FD878
[['Team', 'Record', 'Matches', 'Match Score Avg', 'Individual Game Average', 'Baker Game Average']]

Bowling: IndividualStandings boys

http://quikstatsiowa.com/Public/Bowling/IndividualStandings.aspx?IDSport=DA3506E8-E4CA-4175-BF69-BEBBDC2FD878
[['Bowler', 'Team', 'Games', 'Total Pins', 'Game Avg', 'High Game', 'Series Avg', 'High Series']]

Fall Golf boys

http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=92A34DE4-ACB3-4282-BF29-571A97DE1946
[['Team', '9 Hole Matches', '9 Hole Avg', 'Top 4 Players Total', '18 Hole Matches', '18 Hole Avg', 'Top 4 Players Total', 'Combined (9 & 18) Adjusted Avg', 'Top 4 Players Total', 'Adjusted by Course Rating']]

Fall Golf: IndividualStandings boys

http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=92A34DE4-ACB3-4282-BF29-571A97DE1946
[['Athlete', 'Team', '9 Hole', 'Rounds', '9 Hole', 'Avg', 'Low 9 Hole', 'Score', '18 Hole', 'Rounds', '18 Hole', 'Avg', 'Low 18 Hole', 'Score', 'Combined (9 & 18) Avg', 'Adjusted by Course Rating']]

Football boys

http://quikstatsiowa.com/Public/Football/TeamStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC
[['Offense', 'Passing', 'Rushing', 'Receiving', 'Total Offense', '(Rushing + Receiving)', 'Total Offense', '(Rushing + Passing)', 'Scoring']]

Football: IndividualBestGameStandings boys

http://quikstatsiowa.com/Public/Football/IndividualBestGameStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC
[['Athlete', 'Team', 'Passing Yards', 'Date', 'Location', 'Opponent']]

Football: IndividualStandings boys

http://quikstatsiowa.com/Public/Football/IndividualStandings.aspx?IDSport=91A308DE-5763-4DAA-8C03-9AF66611E0BC
[['Offense', 'Passing', 'Rushing', 'Receiving', 'Total Offense', '(Rushing + Receiving)', 'Total Offense', '(Rushing + Passing)', 'Scoring']]

Soccer boys

http://quikstatsiowa.com/Public/Soccer/TeamStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B
[['Team', 'Record', 'GP', 'G', 'A', 'P', 'Sh', 'Sh %', 'SOG', 'SOG %', 'CK', 'PKM', 'PKA', 'GA', 'GA Avg', 'S', 'S %']]

Soccer: IndividualBestGameStandings boys

http://quikstatsiowa.com/Public/Soccer/IndividualBestGameStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B
[['Athlete', 'Team', 'Goals', 'Date', 'Location', 'Opponent']]

Soccer: IndividualStandings boys

http://quikstatsiowa.com/Public/Soccer/IndividualStandings.aspx?IDSport=9D4214D2-EBE6-429E-9005-C11D2A29C89B
[['Athlete', 'Team', 'GP', 'GS', 'G', 'A', 'P', 'Sh', 'Sh %', 'SOG', 'SOG %', 'PKM', 'PKA', 'GM', 'GA', 'GA Avg', 'S', 'S %']]

Spring Golf boys

http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=FC614ADE-B5DA-4012-A95E-0FD2A594FE9D
[['Team', '9 Hole Matches', '9 Hole Avg', 'Top 4 Players Total', '18 Hole Matches', '18 Hole Avg', 'Top 4 Players Total', 'Combined (9 & 18) Adjusted Avg', 'Top 4 Players Total', 'Adjusted by Course Rating']]

Spring Golf: IndividualStandings boys

http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=FC614ADE-B5DA-4012-A95E-0FD2A594FE9D
[['Athlete', 'Team', '9 Hole', 'Rounds', '9 Hole', 'Avg', 'Low 9 Hole', 'Score', '18 Hole', 'Rounds', '18 Hole', 'Avg', 'Low 18 Hole', 'Score', 'Combined (9 & 18) Avg', 'Adjusted by Course Rating']]

Tennis boys

http://quikstatsiowa.com/Public/Tennis/TeamStandings.aspx?IDSport=19786FF3-ADA3-4C7A-A94F-FAC0811118F5
[['Team', 'Record', 'Singles Won', 'Singles Lost', 'Doubles Won', 'Doubles Lost']]

Tennis: IndividualStandings boys

http://quikstatsiowa.com/Public/Tennis/IndividualStandings.aspx?IDSport=19786FF3-ADA3-4C7A-A94F-FAC0811118F5
[['Athlete', 'Team', 'Match Record', 'Set Record', 'Game Record']]

Basketball girls

http://quikstatsiowa.com/Public/Basketball/TeamStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA
[['Team', 'Record', 'FG Att', 'FG Made', 'FG %', '3 pt Att', '3 pt Made', '3 pt %', 'FT Att', 'FT Made', 'FT %', 'Pts', 'Pts/G']]

Basketball: IndividualBestGameStandings girls

http://quikstatsiowa.com/Public/Basketball/IndividualBestGameStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA
[['Athlete', 'Team', 'Points', 'Date', 'Location', 'Opponent']]

Basketball: IndividualStandings girls

http://quikstatsiowa.com/Public/Basketball/IndividualStandings.aspx?IDSport=B657ECDF-ECD0-4429-810A-9F9274EC4AAA
[['Athlete', 'Team', 'G', 'FG Att', 'FG Made', 'FG %', '3 pt Att', '3 pt Made', '3 pt %', 'FT Att', 'FT Made', 'FT %', 'Pts', 'Pts/G']]

Bowling girls

http://quikstatsiowa.com/Public/Bowling/TeamStandings.aspx?IDSport=0C6DFBCF-98C4-4B01-9F56-17B02E9E47E1
[['Team', 'Record', 'Matches', 'Match Score Avg', 'Individual Game Average', 'Baker Game Average']]

Bowling: IndividualStandings girls

http://quikstatsiowa.com/Public/Bowling/IndividualStandings.aspx?IDSport=0C6DFBCF-98C4-4B01-9F56-17B02E9E47E1
[['Bowler', 'Team', 'Games', 'Total Pins', 'Game Avg', 'High Game', 'Series Avg', 'High Series']]

Golf girls

http://quikstatsiowa.com/Public/Golf/TeamStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688
[['Team', '9 Hole Matches', '9 Hole Avg', 'Top 4 Players Total', '18 Hole Matches', '18 Hole Avg', 'Top 4 Players Total', 'Combined (9 & 18) Adjusted Avg', 'Top 4 Players Total', 'Adjusted by Course Rating']]

Golf: IndividualStandings girls

http://quikstatsiowa.com/Public/Golf/IndividualStandings.aspx?IDSport=6DC124A1-D8C4-4F88-84EF-5C6B4FD4A688
[['Athlete', 'Team', '9 Hole', 'Rounds', '9 Hole', 'Avg', 'Low 9 Hole', 'Score', '18 Hole', 'Rounds', '18 Hole', 'Avg', 'Low 18 Hole', 'Score', 'Combined (9 & 18) Avg', 'Adjusted by Course Rating']]

Soccer girls

http://quikstatsiowa.com/Public/Soccer/TeamStandings.aspx?IDSport=65E5DA09-90C6-45F5-847A-F9A84FD9C5B0
[['Team', 'Record', 'GP', 'G', 'A', 'P', 'Sh', 'Sh %', 'SOG', 'SOG %', 'CK', 'PKM', 'PKA', 'GA', 'GA Avg', 'S', 'S %']]

Soccer: IndividualBestGameStandings girls

http://quikstatsiowa.com/Public/Soccer/IndividualBestGameStandings.aspx?IDSport=65E5DA09-90C6-45F5-847A-F9A84FD9C5B0
[['Athlete', 'Team', 'Goals', 'Date', 'Location', 'Opponent']]

Soccer: IndividualStandings girls

http://quikstatsiowa.com/Public/Soccer/IndividualStandings.aspx?IDSport=65E5DA09-90C6-45F5-847A-F9A84FD9C5B0
[['Athlete', 'Team', 'GP', 'GS', 'G', 'A', 'P', 'Sh', 'Sh %', 'SOG', 'SOG %', 'PKM', 'PKA', 'GM', 'GA', 'GA Avg', 'S', 'S %']]

Softball girls

http://quikstatsiowa.com/Public/BBSB/TeamStandings.aspx?IDSport=D97DD7D0-0BEF-404A-B041-7E51ACFDBD16
[['Team', 'Record', 'GP', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SAC', 'BB', 'SO', 'SB', 'SBA', 'HBP', 'OBP', 'SLG', 'AVG']]

Softball: IndividualBestGameStandings girls

http://quikstatsiowa.com/Public/BBSB/IndividualBestGameStandings.aspx?IDSport=D97DD7D0-0BEF-404A-B041-7E51ACFDBD16
[['Athlete', 'Team', 'Runs', 'Date', 'Location', 'Opponent']]

Softball: IndividualStandings girls

http://quikstatsiowa.com/Public/BBSB/IndividualStandings.aspx?IDSport=D97DD7D0-0BEF-404A-B041-7E51ACFDBD16
[['Athlete', 'Team', 'GP', 'GS', 'AB', 'R', 'H', '2B', '3B', 'HR', 'RBI', 'SAC', 'BB', 'SO', 'SB', 'SBA', 'HBP', 'OBP', 'SLG', 'AVG']]

Tennis girls

http://quikstatsiowa.com/Public/Tennis/TeamStandings.aspx?IDSport=6086C2DF-4661-4701-BFF1-3BB32C081B88
[['Team', 'Record', 'Singles Won', 'Singles Lost', 'Doubles Won', 'Doubles Lost']]

Tennis: IndividualStandings girls

http://quikstatsiowa.com/Public/Tennis/IndividualStandings.aspx?IDSport=6086C2DF-4661-4701-BFF1-3BB32C081B88
[['Athlete', 'Team', 'Match Record', 'Set Record', 'Game Record']]

Volleyball girls

http://quikstatsiowa.com/Public/Volleyball/TeamStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773
[['Team', 'Record', 'Sets', 'Kills', 'Kill', 'Err', 'Attack', 'Attempts', 'Kill', 'Eff', 'Kills', 'per Set', 'Serve', 'Success', 'Serves', 'Serve', 'Eff', 'Aces', 'Aces', 'per Set']]

Volleyball: IndividualBestGameStandings girls

http://quikstatsiowa.com/Public/Volleyball/IndividualBestGameStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773
[['Athlete', 'Team', 'Aces', 'Date', 'Location', 'Opponent']]

Volleyball: IndividualStandings girls

http://quikstatsiowa.com/Public/Volleyball/IndividualStandings.aspx?IDSport=83298383-D7D7-4670-9C6B-24DDB8B2E773
[['Athlete', 'Team', 'Sets', 'Kills', 'Kill', 'Err', 'Attack', 'Attempts', 'Kill', 'Eff', 'Kills', 'per Set', 'Serve', 'Success', 'Serves', 'Serve', 'Eff', 'Aces', 'Aces', 'per Set']]
In [ ]: