{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "## Explore the `html` and `datautil` modules added to chamelboots.\n", "\n", "### Verify what exactly the attribute `sourceline` is in lxml. Does it correspond in any way to the raw html source?" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from lxml import etree" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "from chamelboots.html.packages import bootstrap\n", "from chamelboots.constants import HTML_PARSER\n", "from chamelboots.html import get_html_as_data\n", "from chamelboots.datautil import get_from, paths_in_data" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['starter_html']" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[item for item in dir(bootstrap) if not item.startswith(\"_\")]" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "html_element = (\n", " etree.fromstring(bootstrap.starter_html, HTML_PARSER).getroottree().getroot()\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{('addnext',\n", " 'addprevious',\n", " 'append',\n", " 'attrib',\n", " 'base',\n", " 'clear',\n", " 'cssselect',\n", " 'extend',\n", " 'find',\n", " 'findall',\n", " 'findtext',\n", " 'get',\n", " 'getchildren',\n", " 'getiterator',\n", " 'getnext',\n", " 'getparent',\n", " 'getprevious',\n", " 'getroottree',\n", " 'index',\n", " 'insert',\n", " 'items',\n", " 'iter',\n", " 'iterancestors',\n", " 'iterchildren',\n", " 'iterdescendants',\n", " 'iterfind',\n", " 'itersiblings',\n", " 'itertext',\n", " 'keys',\n", " 'makeelement',\n", " 'nsmap',\n", " 'prefix',\n", " 'remove',\n", " 'replace',\n", " 'set',\n", " 'sourceline',\n", " 'tag',\n", " 'tail',\n", " 'text',\n", " 'values',\n", " 'xpath')}" ] }, 
"execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "{\n", " tuple(item for item in dir(element) if not item.startswith(\"_\"))\n", " for element in html_element.iterdescendants()\n", "}" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "html_element.sourceline" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(0, ''),\n", " (1, ''),\n", " (2, ' '),\n", " (3, ' '),\n", " (4, ' '),\n", " (5,\n", " ' '),\n", " (6, ' '),\n", " (7, ' '),\n", " (11, ' '),\n", " (12, ' '),\n", " (13, ' '),\n", " (18, ' '),\n", " (23, ' '),\n", " (28, ' Bootstrap title'),\n", " (29, ' '),\n", " (30, ' '),\n", " (31, '
'),\n", " (32, '

Hello, world!

'),\n", " (33, '
'),\n", " (34, ' '),\n", " (35, '')]" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[(i, line) for i, line in enumerate(bootstrap.starter_html.splitlines())]" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(, 5), (, 6)]" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[\n", " (element, element.sourceline,)\n", " for element in html_element.iterdescendants()\n", " if element.tag == \"meta\"\n", "]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[(3, ' '),\n", " (4, ' '),\n", " (5,\n", " ' ')]" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "[\n", " (i, line)\n", " for i, line in enumerate(bootstrap.starter_html.splitlines())\n", " if \"meta\" in line\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Conclusion: The `sourceline` attribute in an lxml element is the 1-based line number of the element in the raw html. It only looks off by one above because `enumerate` counts from 0: the two `meta` tags sit at indices 4 and 5 and report `sourceline` 5 and 6." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## HTML in a Pandas dataframe" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "import itertools as it\n", "import operator as op\n" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "from IPython.display import display, HTML" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "data = get_html_as_data(bootstrap.starter_html)" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "paths = paths_in_data(data)" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{2, 5, 8, 11}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "{len(path) for path in paths}" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "scrolled": false }, "outputs": [], "source": [ "by_first = op.itemgetter(0)\n", "dfs = [\n", " pd.DataFrame([op.add(p, (repr(get_from(data, p)),)) for l, p in group])\n", " for key, group in it.groupby(\n", " sorted([(len(path), path) for path in paths], key=by_first), key=by_first,\n", " )\n", "]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Set indices on dataframes to the tag names which are at -3 in columns." ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "for df in dfs:\n", "    # Every row ends with (tag, key, value), so the tag name is always the\n", "    # third column from the end. Select it by position so the narrowest\n", "    # frame (3 columns) is indexed by its tag too, not by its value column.\n", "    df.index = df.iloc[:, -3].tolist()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Search for script and link tags since they likely need editing." ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
scripthtmlinner_content0headinner_content7scriptinner_contentNone
scripthtmlinner_content0headinner_content7scriptattributes{'defer': 'defer', 'src': 'https://code.jquery...
scripthtmlinner_content0headinner_content7scripttail'\\n '
scripthtmlinner_content0headinner_content8scriptinner_contentNone
scripthtmlinner_content0headinner_content8scriptattributes{'defer': 'defer', 'src': 'https://cdnjs.cloud...
scripthtmlinner_content0headinner_content8scripttail'\\n '
scripthtmlinner_content0headinner_content9scriptinner_contentNone
scripthtmlinner_content0headinner_content9scriptattributes{'defer': 'defer', 'src': 'https://stackpath.b...
scripthtmlinner_content0headinner_content9scripttail'\\n '
linkhtmlinner_content0headinner_content4linkinner_contentNone
linkhtmlinner_content0headinner_content4linkattributes{'rel': 'stylesheet', 'href': 'https://stackpa...
linkhtmlinner_content0headinner_content4linktail'\\n '
\n", "
" ], "text/plain": [ " 0 1 2 3 4 5 6 7 \\\n", "script html inner_content 0 head inner_content 7 script inner_content \n", "script html inner_content 0 head inner_content 7 script attributes \n", "script html inner_content 0 head inner_content 7 script tail \n", "script html inner_content 0 head inner_content 8 script inner_content \n", "script html inner_content 0 head inner_content 8 script attributes \n", "script html inner_content 0 head inner_content 8 script tail \n", "script html inner_content 0 head inner_content 9 script inner_content \n", "script html inner_content 0 head inner_content 9 script attributes \n", "script html inner_content 0 head inner_content 9 script tail \n", "link html inner_content 0 head inner_content 4 link inner_content \n", "link html inner_content 0 head inner_content 4 link attributes \n", "link html inner_content 0 head inner_content 4 link tail \n", "\n", " 8 \n", "script None \n", "script {'defer': 'defer', 'src': 'https://code.jquery... \n", "script '\\n ' \n", "script None \n", "script {'defer': 'defer', 'src': 'https://cdnjs.cloud... \n", "script '\\n ' \n", "script None \n", "script {'defer': 'defer', 'src': 'https://stackpath.b... \n", "script '\\n ' \n", "link None \n", "link {'rel': 'stylesheet', 'href': 'https://stackpa... \n", "link '\\n ' " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for df in dfs:\n", " try:\n", " display(df.loc[(\"script\", \"link\"), :])\n", " except KeyError:\n", " pass" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "scrolled": false }, "outputs": [ { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
{'lang': 'en'}htmlattribs{'lang': 'en'}
NonehtmltailNone
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345
headhtmlinner_content0headattributes{}
headhtmlinner_content0headtail'\\n '
bodyhtmlinner_content1bodyattributes{}
bodyhtmlinner_content1bodytail'\\n'
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012345678
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content0<cyfunction Comment at 0x7f1398068ae0>inner_content' Required meta tags '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content0<cyfunction Comment at 0x7f1398068ae0>attributes<lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content0<cyfunction Comment at 0x7f1398068ae0>tail'\\n '
metahtmlinner_content0headinner_content1metainner_contentNone
metahtmlinner_content0headinner_content1metaattributes{'charset': 'utf-8'}
metahtmlinner_content0headinner_content1metatail'\\n '
metahtmlinner_content0headinner_content2metainner_contentNone
metahtmlinner_content0headinner_content2metaattributes{'name': 'viewport', 'content': 'width=device-width, initial-scale=1, shrink-to-fit=no'}
metahtmlinner_content0headinner_content2metatail'\\n '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content3<cyfunction Comment at 0x7f1398068ae0>inner_content' Bootstrap CSS '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content3<cyfunction Comment at 0x7f1398068ae0>attributes<lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content3<cyfunction Comment at 0x7f1398068ae0>tail'\\n '
linkhtmlinner_content0headinner_content4linkinner_contentNone
linkhtmlinner_content0headinner_content4linkattributes{'rel': 'stylesheet', 'href': 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css', 'integrity': 'sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T', 'crossorigin': 'anonymous'}
linkhtmlinner_content0headinner_content4linktail'\\n '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content5<cyfunction Comment at 0x7f1398068ae0>inner_content' Optional JavaScript '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content5<cyfunction Comment at 0x7f1398068ae0>attributes<lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content5<cyfunction Comment at 0x7f1398068ae0>tail'\\n '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content6<cyfunction Comment at 0x7f1398068ae0>inner_content' jQuery first, then Popper.js, then Bootstrap JS '
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content6<cyfunction Comment at 0x7f1398068ae0>attributes<lxml.etree._ImmutableMapping object at 0x7f139804fc30>
<cyfunction Comment at 0x7f1398068ae0>htmlinner_content0headinner_content6<cyfunction Comment at 0x7f1398068ae0>tail'\\n '
scripthtmlinner_content0headinner_content7scriptinner_contentNone
scripthtmlinner_content0headinner_content7scriptattributes{'defer': 'defer', 'src': 'https://code.jquery.com/jquery-3.3.1.slim.min.js', 'integrity': 'sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo', 'crossorigin': 'anonymous'}
scripthtmlinner_content0headinner_content7scripttail'\\n '
scripthtmlinner_content0headinner_content8scriptinner_contentNone
scripthtmlinner_content0headinner_content8scriptattributes{'defer': 'defer', 'src': 'https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js', 'integrity': 'sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1', 'crossorigin': 'anonymous'}
scripthtmlinner_content0headinner_content8scripttail'\\n '
scripthtmlinner_content0headinner_content9scriptinner_contentNone
scripthtmlinner_content0headinner_content9scriptattributes{'defer': 'defer', 'src': 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js', 'integrity': 'sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM', 'crossorigin': 'anonymous'}
scripthtmlinner_content0headinner_content9scripttail'\\n '
titlehtmlinner_content0headinner_content10titleinner_content'Bootstrap title'
titlehtmlinner_content0headinner_content10titleattributes{}
titlehtmlinner_content0headinner_content10titletail'\\n '
divhtmlinner_content1bodyinner_content0divattributes{}
divhtmlinner_content1bodyinner_content0divtail'\\n '
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234567891011
h1htmlinner_content1bodyinner_content0divinner_content0h1inner_content'Hello, world!'
h1htmlinner_content1bodyinner_content0divinner_content0h1attributes{}
h1htmlinner_content1bodyinner_content0divinner_content0h1tail'\\n '
" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "for df in dfs:\n", " display(HTML(df.to_html()))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.0" }, "nikola": { "category": "Chamelboots", "date": "2019-11-19 12:10:48 UTC", "description": "Explore HTML with Pandas data frames.", "link": "", "slug": "explore-html-tools-chamelboots", "tags": "python, code, lxml, chamelboots", "title": "Explore HTML Tools Chamelboots", "type": "text" } }, "nbformat": 4, "nbformat_minor": 2 }