Use the HTMLParser class in Python: a Stack Overflow Answer Submission

A solution to a Stack Overflow question.

Background: parsing HTML with regular expressions is an anti-pattern, which is why this answer uses a real HTML parser instead.

In [6]:
from html.parser import HTMLParser

class MyHTMLParser(HTMLParser):
    """Collect the first run of text found inside each ``<a>`` element.

    After feeding markup, ``data`` holds one string per anchor that
    contained text, in document order. Only the first text chunk reported
    after an ``<a>`` start tag is kept; anchors without any text contribute
    nothing.
    """

    def __init__(self) -> None:
        super().__init__()
        # Captured anchor texts, in the order they were encountered.
        self.data: list[str] = []
        # True from an <a> start tag until its first text chunk or its
        # closing </a>, whichever comes first.
        self.a_tag: bool = False

    def handle_starttag(self, tag: str, attrs: list) -> None:
        """Arm text capture when an ``<a>`` start tag is seen."""
        if tag == "a":
            self.a_tag = True

    def handle_endtag(self, tag: str) -> None:
        """Disarm text capture when ``</a>`` is seen.

        Without this, an anchor that holds no text (e.g. ``<a></a>``) would
        leave the flag set and the next text *outside* the anchor would be
        recorded as if it were link text.
        """
        if tag == "a":
            self.a_tag = False

    def handle_data(self, data: str) -> None:
        """Record ``data`` if it is the first text inside an open anchor."""
        if self.a_tag:
            self.data.append(data)
            # Only the first chunk per anchor is kept.
            self.a_tag = False

# Demo: pull the link text out of every <a> element in a sample fragment.
string = """aaa<a class="c-item_foot" href="/news/a/">11r11</a></div>bbb<a class="c-item_foot" href="/news/b/">222</a></div>ccgc<a class="c-item_foot" href="/news/c/">3333a333</a></div>ddd<a class="c-item_foot" href="/news/d/">44a444444</a></div>eee"""
parser = MyHTMLParser()
parser.feed(string)
# feed() may hold back trailing text while waiting for more input; close()
# tells the parser the document is complete so nothing is left unprocessed.
parser.close()
print(parser.data)
['11r11', '222', '3333a333', '44a444444']