Create HTML Elements with lxml Python Library

Use Python lxml library to create HTML elements.

Resources

In [1]:
from lxml import etree
In [2]:
# Bare container element; the generated child elements are attached to it below.
root = etree.Element("root")
root
Out[2]:
<Element root at 0x7f1235e575a0>
In [3]:
# iter() yields the element itself followed by its descendants; root has no children yet.
list(root.iter())
Out[3]:
[<Element root at 0x7f1235e575a0>]

Output from etree.SubElement??

Signature:      etree.SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)
Call signature: etree.SubElement(*args, **kwargs)
Type:           cython_function_or_method
String form:    <cyfunction SubElement at 0x7f4f8c0df5c8>
Docstring:     
SubElement(_parent, _tag, attrib=None, nsmap=None, **_extra)

Subelement factory.  This function creates an element instance, and
appends it to an existing element.

Get the tags and attributes of the elements inside the `<head>` of the Bootstrap starter template.

Try to recreate this with lxml.

In [4]:
# Reference <head> markup whose elements are recreated with lxml below.
# "${title}" is a template placeholder and is kept verbatim.
head_html = """  <head>
    <!-- Required meta tags -->
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no">

    <!-- Bootstrap CSS -->
    <link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous">
    <!-- Optional JavaScript -->
    <!-- jQuery first, then Popper.js, then Bootstrap JS -->
    <script defer src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"></script>
    <script defer src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"></script>
    <script defer src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"></script>

    <title>${title}</title>
  </head>
"""
In [5]:
# Parse the fragment with the lenient HTML parser, then collect a
# (tag, attributes) pair for every real element.  Comment nodes are skipped
# because their ``tag`` is not a string.
tree = etree.fromstring(head_html, parser=etree.HTMLParser())
inner_element_collection = [
    (node.tag, node.attrib)
    for node in tree.iterdescendants()
    if isinstance(node.tag, str)
]
# Drop the first pair so the collection starts at the first <meta> element.
inner_element_collection = inner_element_collection[1:]
inner_element_collection
Out[5]:
[('meta', {'charset': 'utf-8'}),
 ('meta',
  {'name': 'viewport', 'content': 'width=device-width, initial-scale=1, shrink-to-fit=no'}),
 ('link',
  {'rel': 'stylesheet', 'href': 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css', 'integrity': 'sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T', 'crossorigin': 'anonymous'}),
 ('script',
  {'defer': 'defer', 'src': 'https://code.jquery.com/jquery-3.3.1.slim.min.js', 'integrity': 'sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo', 'crossorigin': 'anonymous'}),
 ('script',
  {'defer': 'defer', 'src': 'https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js', 'integrity': 'sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1', 'crossorigin': 'anonymous'}),
 ('script',
  {'defer': 'defer', 'src': 'https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js', 'integrity': 'sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM', 'crossorigin': 'anonymous'}),
 ('title', {})]
In [6]:
# Recreate every collected element as a child of root.
# SubElement appends to its parent, so the existing children are removed
# first; otherwise re-running this cell would add a second copy of each one.
del root[:]
for tag, attrib in inner_element_collection:
    etree.SubElement(root, tag, attrib=attrib)
In [7]:
# Materialize the children created under root (iterdescendants() excludes root itself).
header_elements = list(root.iterdescendants())
header_elements
Out[7]:
[<Element meta at 0x7f1235ded870>,
 <Element meta at 0x7f1235deda50>,
 <Element link at 0x7f1235dedd20>,
 <Element script at 0x7f1235ded190>,
 <Element script at 0x7f1235ded230>,
 <Element script at 0x7f1235ded3c0>,
 <Element title at 0x7f1235ded320>]
In [8]:
# Keep the first generated element for inspection; the starred target
# collects the remaining elements into the throwaway name ``_``.
sample, *_ = header_elements
sample
Out[8]:
<Element meta at 0x7f1235ded870>
In [9]:
# List the public names of lxml.etree that mention "string" to locate the
# parsing and serialization helpers.
[item for item in dir(etree) if not item.startswith("_") and "string" in item]
Out[9]:
['fromstring', 'fromstringlist', 'tostring', 'tostringlist']
In [10]:
# Serialize each element with tostring()'s default settings.  Empty elements
# come out self-closed (e.g. <title/>); passing method="html", as done further
# down, writes an explicit closing tag instead.
[etree.tostring(e) for e in header_elements]
Out[10]:
[b'<meta charset="utf-8"/>',
 b'<meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"/>',
 b'<link rel="stylesheet" href="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/css/bootstrap.min.css" integrity="sha384-ggOyR0iXCbMQv3Xipma34MD+dH/1fQ784/j6cY/iJTQUOhcWr7x9JvoRxT2MZw1T" crossorigin="anonymous"/>',
 b'<script defer="defer" src="https://code.jquery.com/jquery-3.3.1.slim.min.js" integrity="sha384-q8i/X+965DzO0rT7abK41JStQIAqVgRVzpbzo5smXKp4YfRvH+8abtTE1Pi6jizo" crossorigin="anonymous"/>',
 b'<script defer="defer" src="https://cdnjs.cloudflare.com/ajax/libs/popper.js/1.14.7/umd/popper.min.js" integrity="sha384-UO2eT0CpHqdSJQ6hJty5KVphtPhzWj9WO1clHTMGa3JDZwrnQq4sF86dIHNDz0W1" crossorigin="anonymous"/>',
 b'<script defer="defer" src="https://stackpath.bootstrapcdn.com/bootstrap/4.3.1/js/bootstrap.min.js" integrity="sha384-JjSmVgyd0p3pXB1rRibZUAYoIIy6OrQ6VrjIEaFf/nJGzIxFDsf4x0xIM+B07jRM" crossorigin="anonymous"/>',
 b'<title/>']

Create Chameleon templates.

Can lxml be used to create Chameleon template HTML strings?

In [11]:
# Root <html> element that the template experiments below attach children to.
html_doc = etree.Element("html")
etree.tostring(html_doc)
Out[11]:
b'<html/>'
In [12]:
class Bunch(dict):
    """Dictionary whose entries are also reachable as attributes.

    The instance ``__dict__`` is bound to the mapping itself, so
    ``bunch.key`` and ``bunch["key"]`` always read and write the same entry.
    """

    def __init__(self, **kwargs):
        """Store every keyword argument as both a key and an attribute."""
        dict.__init__(self, **kwargs)
        # Alias attribute storage to the dict so both views stay in sync.
        self.__dict__ = self
In [13]:
# Prefix that marks an attribute as a TAL statement, e.g. "tal:repeat".
TAL = "tal:"

Get all the possible "tal" values from HTML in Chameleon docs.

In [14]:
# Statement reference table (HTML) listing the TAL statements and their
# descriptions; the statement names are extracted from it in the next cell.
table = """<table class="docutils" border="1">
<colgroup>
<col width="23%">
<col width="78%">
</colgroup>
<thead valign="bottom">
<tr class="row-odd"><th class="head">Statement</th>
<th class="head">Description</th>
</tr>
</thead>
<tbody valign="top">
<tr class="row-even"><td><code class="docutils literal notranslate"><span class="pre">tal:define</span></code></td>
<td>Define variables.</td>
</tr>
<tr class="row-odd"><td><code class="docutils literal notranslate"><span class="pre">tal:switch</span></code></td>
<td>Defines a switch condition</td>
</tr>
<tr class="row-even"><td><code class="docutils literal notranslate"><span class="pre">tal:condition</span></code></td>
<td>Include element only if expression is true.</td>
</tr>
<tr class="row-odd"><td><code class="docutils literal notranslate"><span class="pre">tal:repeat</span></code></td>
<td>Repeat an element.</td>
</tr>
<tr class="row-even"><td><code class="docutils literal notranslate"><span class="pre">tal:case</span></code></td>
<td>Includes element only if expression is equal to parent switch.</td>
</tr>
<tr class="row-odd"><td><code class="docutils literal notranslate"><span class="pre">tal:content</span></code></td>
<td>Substitute the content of an element.</td>
</tr>
<tr class="row-even"><td><code class="docutils literal notranslate"><span class="pre">tal:replace</span></code></td>
<td>Replace the element with dynamic content.</td>
</tr>
<tr class="row-odd"><td><code class="docutils literal notranslate"><span class="pre">tal:omit-tag</span></code></td>
<td>Omit the element tags, leaving only the inner content.</td>
</tr>
<tr class="row-even"><td><code class="docutils literal notranslate"><span class="pre">tal:attributes</span></code></td>
<td>Dynamically change or insert element attributes.</td>
</tr>
<tr class="row-odd"><td><code class="docutils literal notranslate"><span class="pre">tal:on-error</span></code></td>
<td>Substitute the content of an element if processing fails.</td>
</tr>
</tbody>
</table>"""

# Parse with the HTML parser so the text of each cell can be walked.
ref_table = etree.fromstring(table, etree.HTMLParser())
In [15]:
# Collect the statement names: every element whose own text mentions the
# "tal:" prefix contributes that text with the prefix removed.
tal_attrs = [
    node.text.replace(TAL, "")
    for node in ref_table.iterdescendants()
    if node.text and node.text.strip() and TAL in node.text
]
tal_attrs
Out[15]:
['define',
 'switch',
 'condition',
 'repeat',
 'case',
 'content',
 'replace',
 'omit-tag',
 'attributes',
 'on-error']
In [16]:
# Tag names for the example elements.
# NOTE(review): TAG is not referenced by any cell visible here — confirm it is still needed.
TAG = "meta"
DIV = "div"
In [17]:
# Map every TAL statement name to itself so the names are available both as
# attributes (chameleon_tal.repeat) and as keys (chameleon_tal["omit-tag"]).
chameleon_tal = Bunch(**{name: name for name in tal_attrs})
chameleon_tal
Out[17]:
{'define': 'define',
 'switch': 'switch',
 'condition': 'condition',
 'repeat': 'repeat',
 'case': 'case',
 'content': 'content',
 'replace': 'replace',
 'omit-tag': 'omit-tag',
 'attributes': 'attributes',
 'on-error': 'on-error'}
In [18]:
# Build a <div> carrying a "repeat" statement.  The attribute is created
# without the "tal:" prefix; the prefix is added to the serialized string
# in the following cell.
# SubElement appends to html_doc, so each run of this cell adds another <div>.
element = etree.SubElement(html_doc, DIV, attrib={chameleon_tal.repeat: "item items"})
element.text = "item: ${item}"
# method="html" writes an explicit closing tag; decode() turns the bytes into str.
html_string = etree.tostring(element, method="html").decode()
element, html_string
Out[18]:
(<Element div at 0x7f1235e136e0>,
 '<div repeat="item items">item: ${item}</div>')
In [19]:
# lxml doesn't allow an attribute with name "tal:{something}", so the prefix
# is added to the serialized markup instead.
# Only attribute positions (" name=") are rewritten.  Replacing the bare word
# anywhere in the string would also corrupt element text that merely contains
# "case", "content", "define", ...  The rewrite is idempotent, because
# "tal:name=" no longer contains " name=", so no guard is needed on re-runs.
for value in chameleon_tal.values():
    html_string = html_string.replace(f" {value}=", f" {TAL}{value}=")
html_string
Out[19]:
'<div tal:repeat="item items">item: ${item}</div>'
In [20]:
import html
from chameleon import PageTemplate
In [21]:
EOL = "\n"


class ChameleonTemplate:
    """Create a Chameleon template string for one HTML element using lxml.

    The element is built with lxml, serialized as HTML and stored in
    ``html_string`` with its TAL attributes carrying the ``tal:`` prefix.
    Attributes that are not found on the instance are looked up on a
    Chameleon ``PageTemplate`` built from ``html_string``, so
    ``template.render(...)`` can be called directly.
    """

    # Supported TAL statements: constructor argument -> attribute name.
    tal_lookup = {
        "define": "define",
        "switch": "switch",
        "condition": "condition",
        "repeat": "repeat",
        "case": "case",
        "content": "content",
        "replace": "replace",
        "omit-tag": "omit-tag",
        "attributes": "attributes",
        "on-error": "on-error",
    }

    # Marker text placed in the outer element and swapped for the serialized
    # inner template when templates are nested.
    _inner_placeholder = "__chameleon_inner_content__"

    def __init__(
        self,
        tag: str = "div",
        tal: str = None,
        value: str = "item items",
        inner_content: str = "item: ${item}",
    ):
        """Build the element and its template string.

        :tag: tag name of the element
        :tal: the tal statement, e.g. "repeat" gives "tal:repeat"; ``None``
            creates the element without a tal attribute
        :value: the value of the tal attribute, basically Python code
        :inner_content: text placed inside the element, or another
            ``ChameleonTemplate`` whose element is nested inside this one

        An unknown ``tal`` is reported on stdout together with the valid
        choices; the instance is then left without ``element`` and
        ``html_string``.
        """
        self.html_parser = etree.HTMLParser()
        self.root_element = root_element = etree.Element("html")

        # Validate the statement on its own so that errors raised while
        # building the markup are not misreported as an invalid tal value.
        if tal is None:
            attrib = dict()
        elif tal in self.tal_lookup:
            attrib = {self.tal_lookup[tal]: value}
        else:
            print(f'"{tal}" is not a valid tal value. Choices:\n {EOL.join(self.tal_lookup.values())}')
            return

        self.element = etree.SubElement(root_element, tag, attrib=attrib)
        self.inner_content = inner_content
        self._set_inner_content()
        self._set_tals_on_html_string()

    def tostring(self, element):
        """Return ``element`` serialized with lxml's HTML method as ``str``."""
        return etree.tostring(element, method="html").decode()

    def _set_inner_content(self):
        """Fill the element and store its serialization in ``html_string``."""
        try:
            self.element.text = self.inner_content
        except TypeError:  # inner_content is not a string so nest its element
            self._embed_inner_template()
        else:
            self.html_string = self.tostring(self.element)

    def _embed_inner_template(self):
        """Nest the element of ``inner_content`` inside this element."""
        inner_markup = html.unescape(self.tostring(self.inner_content.element))
        self.element.text = self._inner_placeholder
        # Plain replacement instead of str.format(): attribute values may
        # contain braces (e.g. "${...}"), which format() would try to expand.
        outer_markup = self.tostring(self.element).replace(
            self._inner_placeholder, inner_markup
        )
        tree = etree.fromstring(outer_markup, self.html_parser)
        # The HTML parser wraps the fragment, so the outer element is the
        # first child of the root's first child, however deep the nesting is.
        nested = tree[0][0]
        # Keep the real nested structure on the instance (not the marker
        # text) so this template can itself be nested in another one.
        self.root_element.replace(self.element, nested)
        self.element = nested
        self.html_string = self.tostring(nested)

    def _set_tals_on_html_string(self):
        """Set tals because lxml doesn't allow an attribute with name "tal:{something}".

        Only attribute positions (``" name="``) are rewritten, so element
        text that merely contains a statement name is left alone.  The
        rewrite is idempotent.
        """
        for name in self.tal_lookup.values():
            self.html_string = self.html_string.replace(f" {name}=", f" tal:{name}=")

    def __getattr__(self, attr):
        """Rather than extend PageTemplate or create a property method, trap the attr lookup.

        Raises ``AttributeError`` when the template string was never built
        or when ``PageTemplate`` has no such attribute.
        """
        # This hook only runs after normal lookup failed.  html_string is read
        # from the instance dict because ``self.html_string`` would re-enter
        # this method without end when the attribute is missing.
        html_string = self.__dict__.get("html_string")
        if html_string is not None:
            try:
                return getattr(PageTemplate(html_string), attr)
            except AttributeError:
                pass
        raise AttributeError(
            f"{type(self).__name__!r} object has no attribute {attr!r}"
        )
In [22]:
from IPython.display import display, HTML
from faker import Faker

fake = Faker()
# Seed the generator so the rendered phrases are the same on every run.
fake.seed_instance(0)

# Inner template: a bare <li> showing the loop variable.
li = ChameleonTemplate(tag="li", inner_content="${item}")
print(li.html_string)
# Outer template: a <ul> with a "repeat" statement wrapping the <li>.
# render() is resolved through ChameleonTemplate.__getattr__.
display(
    HTML(
        ChameleonTemplate(tag="ul", tal="repeat", inner_content=li).render(
            items=[f"catch phrase {i}: {fake.catch_phrase()}" for i in range(1, 11)]
        )
    )
)
<li>${item}</li>
  • catch phrase 1: Ergonomic cohesive analyzer
  • catch phrase 2: Universal mobile system engine
  • catch phrase 3: Horizontal bi-directional product
  • catch phrase 4: Fundamental global circuit
  • catch phrase 5: Adaptive didactic intranet
  • catch phrase 6: Down-sized incremental definition
  • catch phrase 7: Multi-tiered demand-driven circuit
  • catch phrase 8: Vision-oriented exuding contingency
  • catch phrase 9: Innovative zero administration database
  • catch phrase 10: Enterprise-wide high-level conglomeration
In [23]:
# Use the default tag, value and inner content; only the TAL statement is chosen.
repeat_template = ChameleonTemplate(tal="repeat")
print(repeat_template.html_string)
# render() is not defined on ChameleonTemplate; __getattr__ forwards it to a PageTemplate.
print(repeat_template.render(items=list(range(10))))
<div tal:repeat="item items">item: ${item}</div>
<div>item: 0</div>
<div>item: 1</div>
<div>item: 2</div>
<div>item: 3</div>
<div>item: 4</div>
<div>item: 5</div>
<div>item: 6</div>
<div>item: 7</div>
<div>item: 8</div>
<div>item: 9</div>
In [24]:
# An unknown tal value does not raise: the constructor prints the valid choices and still returns an instance.
ChameleonTemplate(tag="ul", tal="foo", inner_content=li)
"foo" is not a valid tal value. Choices:
 define
switch
condition
repeat
case
content
replace
omit-tag
attributes
on-error
Out[24]:
<__main__.ChameleonTemplate at 0x7f1235ac4730>
In [28]:
title = etree.Element('title')
# iter() yields the element itself first; with no children the single-target
# unpacking simply rebinds ``title`` to the same element.
title, = title.iter()
# method='html' writes an explicit closing tag instead of the self-closed <title/>.
etree.tostring(title, method='html').decode()
Out[28]:
'<title></title>'