使用 Python 的 lxml 库解析 HTML

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
import urllib2
import lxml.html as H

def getjarinfo(url):
    """Fetch the page at *url*, print its download entries, return the description.

    The page is parsed with ``lxml.html`` and three node lists are pulled out
    by XPath: the first link in the first cell of the second row of each
    ``table#download``, each ``td#music`` and each ``div#game``.  The lists
    are matched up positionally and, for every complete triple, the link's
    ``href`` and the text content of the other two nodes are printed.
    NOTE(review): the original local names (``pinpais``, ``jixings``) suggest
    these are brand and model fields -- confirm against the target page.

    Args:
        url: Address of the HTML page to download.

    Returns:
        The text content of the element that directly follows the ``div``
        whose text is exactly u"游戏", or ``None`` if there is no such
        ``div`` or it has no following sibling.

    Raises:
        urllib2.URLError: If the page cannot be fetched.
    """
    response = urllib2.urlopen(url)
    try:
        html = response.read()
    finally:
        # urllib2 responses are not context managers, so close explicitly
        # to release the connection even when read() fails.
        response.close()

    doc = H.document_fromstring(html)
    jars = doc.xpath("//table[@id='download']//tr[2]/td[1]/a[1]")
    pinpais = doc.xpath("//td[@id='music']")
    jixings = doc.xpath("//div[@id='game']")

    # zip() stops at the shortest list, so a page where the three queries
    # return different counts no longer raises IndexError mid-output.
    for jar, pinpai, jixing in zip(jars, pinpais, jixings):
        print(jar.get('href'))
        print(pinpai.text_content())
        print(jixing.text_content())

    headings = doc.xpath(u"//div[text()='%s']" % u"游戏")
    if not headings:
        return None
    # getnext() returns None when the heading is the last child of its parent.
    following = headings[0].getnext()
    if following is None:
        return None
    return following.text_content()
   
if __name__ == '__main__':
    # Script entry point: run the scraper once against a sample address.
    getjarinfo(url='http://google.com/')
Share

Related Posts

0 Responses to “使用 Python 的 lxml 库解析 HTML”


  • No Comments

Leave a Reply