Explore HTML Tools Chamelboots

Explore the html and datautil modules added to chamelboots.

Verify what exactly the attribute sourceline is in lxml. Does it correspond in any way to the raw html source?

In [1]:
from lxml import etree
In [2]:
from chamelboots.html.packages import bootstrap
from chamelboots.constants import HTML_PARSER
from chamelboots.html import get_html_as_data
from chamelboots.datautil import get_from, paths_in_data
In [3]:
[item for item in dir(bootstrap) if not item.startswith("_")]
Out[3]:
['starter_html']
In [4]:
html_element = (
    etree.fromstring(bootstrap.starter_html, HTML_PARSER).getroottree().getroot()
)
In [5]:
{
    tuple(item for item in dir(element) if not item.startswith("_"))
    for element in html_element.iterdescendants()
}
Out[5]:
{('addnext',
  'addprevious',
  'append',
  'attrib',
  'base',
  'clear',
  'cssselect',
  'extend',
  'find',
  'findall',
  'findtext',
  'get',
  'getchildren',
  'getiterator',
  'getnext',
  'getparent',
  'getprevious',
  'getroottree',
  'index',
  'insert',
  'items',
  'iter',
  'iterancestors',
  'iterchildren',
  'iterdescendants',
  'iterfind',
  'itersiblings',
  'itertext',
  'keys',
  'makeelement',
  'nsmap',
  'prefix',
  'remove',
  'replace',
  'set',
  'sourceline',
  'tag',
  'tail',
  'text',
  'values',
  'xpath')}
In [6]:
html_element.sourceline
Out[6]:
2
In [7]:
[(i, line) for i, line in enumerate(bootstrap.starter_html.splitlines())]
Out[7]:
[(0, '<!doctype html>'),
 (1, '<html lang="en">'),
 (2, '  <head>'),
 (3, '    <!-- Required meta tags -->'),
 (4, '    <meta charset="utf-8">'),
 (5,
  '    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">'),
 (6, '    <!-- Bootstrap CSS -->'),
 (7, '    <link'),
 (8, '    rel="stylesheet"'),
 (9,
  '    href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css"'),
 (10,
  '    integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">'),
 (11, '    <!-- Optional JavaScript -->'),
 (12, '    <!-- jQuery first, then Popper.js, then Bootstrap JS -->'),
 (13, '    <script'),
 (14, '    defer="defer"'),
 (15, '    src="https://code.jquery.com/jquery-3.3.1.slim.min.js"'),
 (16,
  '    integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo"'),
 (17, '    crossorigin="anonymous"></script>'),
 (18, '    <script'),
 (19, '    defer="defer"'),
 (20,
  '    src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js"'),
 (21,
  '    integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1"'),
 (22, '    crossorigin="anonymous"></script>'),
 (23, '    <script'),
 (24, '    defer="defer"'),
 (25,
  '    src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js"'),
 (26,
  '    integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM"'),
 (27, '    crossorigin="anonymous"></script>'),
 (28, '    <title>Bootstrap title</title>'),
 (29, '  </head>'),
 (30, '  <body>'),
 (31, '    <div>'),
 (32, '        <h1>Hello, world!</h1>'),
 (33, '    </div>'),
 (34, '  </body>'),
 (35, '</html>')]
In [8]:
[
    (element, element.sourceline,)
    for element in html_element.iterdescendants()
    if element.tag == "meta"
]
Out[8]:
[(<Element meta at 0x7f137b6cdf50>, 5), (<Element meta at 0x7f137b6cd460>, 6)]
In [9]:
[
    (i, line)
    for i, line in enumerate(bootstrap.starter_html.splitlines())
    if "meta" in line
]
Out[9]:
[(3, '    <!-- Required meta tags -->'),
 (4, '    <meta charset="utf-8">'),
 (5,
  '    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">')]

Conclusion: The sourceline attribute in an lxml element doesn't correspond with the source line of the raw html.

HTML in a Pandas dataframe

In [10]:
import itertools as it
import operator as op
In [11]:
import pandas as pd
from IPython.display import display, HTML
In [12]:
data = get_html_as_data(bootstrap.starter_html)
In [13]:
paths = paths_in_data(data)
In [14]:
{len(path) for path in paths}
Out[14]:
{2, 5, 8, 11}
In [15]:
by_first = op.itemgetter(0)
dfs = [
    pd.DataFrame([op.add(p, (repr(get_from(data, p)),)) for l, p in group])
    for key, group in it.groupby(
        sorted([(len(path), path) for path in paths], key=by_first), key=by_first,
    )
]

Set indices on dataframes to the tag names which are at -3 in columns.

In [16]:
for df in dfs:

    stop = i if (i := df.columns.stop - 3) else len(df.columns)
    df.index = [list(row)[-1] for i, row in df.loc[:, :stop].iterrows()]
In [17]:
for df in dfs:
    try:
        display(df.loc[("script", "link"), :])
    except KeyError:
        pass
0 1 2 3 4 5 6 7 8
script html inner_content 0 head inner_content 7 script inner_content None
script html inner_content 0 head inner_content 7 script attributes {'defer': 'defer', 'src': 'https://code.jquery...
script html inner_content 0 head inner_content 7 script tail '\n '
script html inner_content 0 head inner_content 8 script inner_content None
script html inner_content 0 head inner_content 8 script attributes {'defer': 'defer', 'src': 'https://cdnjs.cloud...
script html inner_content 0 head inner_content 8 script tail '\n '
script html inner_content 0 head inner_content 9 script inner_content None
script html inner_content 0 head inner_content 9 script attributes {'defer': 'defer', 'src': 'https://stackpath.b...
script html inner_content 0 head inner_content 9 script tail '\n '
link html inner_content 0 head inner_content 4 link inner_content None
link html inner_content 0 head inner_content 4 link attributes {'rel': 'stylesheet', 'href': 'https://stackpa...
link html inner_content 0 head inner_content 4 link tail '\n '
In [18]:
for df in dfs:
    display(HTML(df.to_html()))
0 1 2
{'lang': 'en'} html attribs {'lang': 'en'}
None html tail None
0 1 2 3 4 5
head html inner_content 0 head attributes {}
head html inner_content 0 head tail '\n '
body html inner_content 1 body attributes {}
body html inner_content 1 body tail '\n'
0 1 2 3 4 5 6 7 8
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 0 <cyfunction Comment at 0x7f1398068ae0> inner_content ' Required meta tags '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 0 <cyfunction Comment at 0x7f1398068ae0> attributes <lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 0 <cyfunction Comment at 0x7f1398068ae0> tail '\n '
meta html inner_content 0 head inner_content 1 meta inner_content None
meta html inner_content 0 head inner_content 1 meta attributes {'charset': 'utf-8'}
meta html inner_content 0 head inner_content 1 meta tail '\n '
meta html inner_content 0 head inner_content 2 meta inner_content None
meta html inner_content 0 head inner_content 2 meta attributes {'name': 'viewport', 'content': 'width=device-width, initial-scale=1, shrink-to-fit=no'}
meta html inner_content 0 head inner_content 2 meta tail '\n '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 3 <cyfunction Comment at 0x7f1398068ae0> inner_content ' Bootstrap CSS '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 3 <cyfunction Comment at 0x7f1398068ae0> attributes <lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 3 <cyfunction Comment at 0x7f1398068ae0> tail '\n '
link html inner_content 0 head inner_content 4 link inner_content None
link html inner_content 0 head inner_content 4 link attributes {'rel': 'stylesheet', 'href': 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css', 'integrity': 'sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T', 'crossorigin': 'anonymous'}
link html inner_content 0 head inner_content 4 link tail '\n '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 5 <cyfunction Comment at 0x7f1398068ae0> inner_content ' Optional JavaScript '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 5 <cyfunction Comment at 0x7f1398068ae0> attributes <lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 5 <cyfunction Comment at 0x7f1398068ae0> tail '\n '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 6 <cyfunction Comment at 0x7f1398068ae0> inner_content ' jQuery first, then Popper.js, then Bootstrap JS '
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 6 <cyfunction Comment at 0x7f1398068ae0> attributes <lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0> html inner_content 0 head inner_content 6 <cyfunction Comment at 0x7f1398068ae0> tail '\n '
script html inner_content 0 head inner_content 7 script inner_content None
script html inner_content 0 head inner_content 7 script attributes {'defer': 'defer', 'src': 'https://code.jquery.com/jquery-3.3.1.slim.min.js', 'integrity': 'sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo', 'crossorigin': 'anonymous'}
script html inner_content 0 head inner_content 7 script tail '\n '
script html inner_content 0 head inner_content 8 script inner_content None
script html inner_content 0 head inner_content 8 script attributes {'defer': 'defer', 'src': 'https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js', 'integrity': 'sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1', 'crossorigin': 'anonymous'}
script html inner_content 0 head inner_content 8 script tail '\n '
script html inner_content 0 head inner_content 9 script inner_content None
script html inner_content 0 head inner_content 9 script attributes {'defer': 'defer', 'src': 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js', 'integrity': 'sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM', 'crossorigin': 'anonymous'}
script html inner_content 0 head inner_content 9 script tail '\n '
title html inner_content 0 head inner_content 10 title inner_content 'Bootstrap title'
title html inner_content 0 head inner_content 10 title attributes {}
title html inner_content 0 head inner_content 10 title tail '\n '
div html inner_content 1 body inner_content 0 div attributes {}
div html inner_content 1 body inner_content 0 div tail '\n '
0 1 2 3 4 5 6 7 8 9 10 11
h1 html inner_content 1 body inner_content 0 div inner_content 0 h1 inner_content 'Hello, world!'
h1 html inner_content 1 body inner_content 0 div inner_content 0 h1 attributes {}
h1 html inner_content 1 body inner_content 0 div inner_content 0 h1 tail '\n '