html = etree.parse('./test.html', etree.HTMLParser()) result = html.xpath('//*') print(result)
#运行结果: [<Element html at 0x10510d9c8>, <Element body at 0x10510da08>, <Element div at 0x10510da48>, <Element ul at 0x10510da88>, <Element li at 0x10510dac8>, <Element a at 0x10510db48>, <Element li at 0x10510db88>, <Element a at 0x10510dbc8>, <Element li at 0x10510dc08>, <Element a at 0x10510db08>, <Element li at 0x10510dc48>, <Element a at 0x10510dc88>, <Element li at 0x10510dcc8>, <Element a at 0x10510dd08>]
当然,此处匹配也可以指定节点名称。如果想获取所有li节点,示例如下:
from lxml import etree html = etree.parse('./test.html', etree.HTMLParser()) result = html.xpath('//li') print(result) print(result[0])
#运行结果 [<Element li at 0x105849208>, <Element li at 0x105849248>, <Element li at 0x105849288>, <Element li at 0x1058492c8>, <Element li at 0x105849308>] <Element li at 0x105849208>
from lxml import etree text = ''' <li class="li li-first"><a href="link.html">first item</a></li> ''' html = etree.HTML(text) result = html.xpath('//li[contains(@class, "li")]/a/text()') print(result)
8 多属性匹配
and:用来连接匹配的多个属性
from lxml import etree text = ''' <li class="li li-first" name="item"><a href="link.html">first item</a></li> ''' html = etree.HTML(text) result = html.xpath('//li[contains(@class, "li") and @name="item"]/a/text()') print(result)
text = ''' <div> <ul> <li class="item-0"><a href="link1.html"><span>first item</span></a></li> <li class="item-1"><a href="link2.html">second item</a></li> <li class="item-inactive"><a href="link3.html">third item</a></li> <li class="item-1"><a href="link4.html">fourth item</a></li> <li class="item-0"><a href="link5.html">fifth item</a> </ul> </div> ''' html = etree.HTML(text) result = html.xpath('//li[1]/ancestor::*') print(result) result = html.xpath('//li[1]/ancestor::div') print(result) result = html.xpath('//li[1]/attribute::*') print(result) result = html.xpath('//li[1]/child::a[@href="link1.html"]') print(result) result = html.xpath('//li[1]/descendant::span') print(result) result = html.xpath('//li[1]/following::*[2]') print(result) result = html.xpath('//li[1]/following-sibling::*')<code class="lang-python"> <span class="kwd">print</span><span class="pun">(</span><span class="pln">result</span><span class="pun">)</span>
#运行结果 [<Element html at 0x107941808>, <Element body at 0x1079418c8>, <Element div at 0x107941908>, <Element ul at 0x107941948>] [<Element div at 0x107941908>] ['item-0'] [<Element a at 0x1079418c8>] [<Element span at 0x107941948>] [<Element a at 0x1079418c8>] [<Element li at 0x107941948>, <Element li at 0x107941988>, <Element li at 0x1079419c8>, <Element li at 0x107941a08>]