# -*- encoding:UTF-8 -*-
'''
This is geturl.
Written by yuzebin : yuzebin@gmail.com
Important: this script runs under cygwin or linux; if you run it on windows
you need curl and wget for windows.
'''
# Overview of the classes defined in this module. The name carries a trailing
# underscore because ``class`` is a reserved word and cannot be assigned to.
class_ = '''
CGetPage is charge of to get a url , it have three methods to get a page : urllib,curl and wget;
CParsePage is charge of to parse the page , and return the match;
CGetMatch is the forcad class to wrap the CGetPage and CParsePage.
'''
# Release notes for this script, newest entry first. Kept as a plain
# module-level string; nothing in this file reads it.
history = '''
2006.07.10 version 0.0.0.9 :
Publish this code to internet , ;-)
2006.06.27 version 0.0.0.7 :
refrectoring class CParsePage : return re.match only
refrectoring class CGetCount : rename to CGetMatch
2006.06.26 version 0.0.0.3 :
modify class CParsePage , return re.match
2006.06.22 version 0.0.0.2 :
add class CGetCount
this version is the first workable version.
add cnsky.
2006.06.21 initial version 0.0.0.1 :
add class CGetPage and CParsePage
cannot work ;-)
'''
import string,re,os,fnmatch,sys,copy,gzip,time,datetime,urllib
from types import *
# Global switch read by funcDebugInfo; set it to True to get debug output.
isDebugMode = False


def funcUrlRead(url):
    '''
    Open url with urllib.urlopen and return everything read from it.

    The response object is closed explicitly, also when read() raises, so
    the connection is not left open until the garbage collector runs.
    Whatever urlopen/read raise is passed on to the caller.
    '''
    response = urllib.urlopen(url)
    try:
        return response.read()
    finally:
        response.close()
def funcOutputMessage(msg):
    '''Print the str() form of msg on standard output.'''
    text = str(msg)
    # A single parenthesised argument prints the same thing whether print is
    # a statement or a function.
    print(text)
def funcDebugInfo(msg):
    '''
    Print the str() form of msg, but only while debugging is switched on.

    Debugging is on when the module-level isDebugMode flag is truthy; any
    truthy value counts, not only the exact value True.
    '''
    if isDebugMode:
        print(str(msg))
class CGetPage:
    '''
    Fetch the content behind one url.

    Three ways to fetch are offered: getPage (urllib, through funcUrlRead),
    curlPage (the external curl program) and wgetPage (the external wget
    program, which downloads into a directory instead of into self.page).
    '''

    def __init__(self, url):
        '''
        Store url as self.url when urlCheck accepts it.

        When urlCheck rejects the url, self.url is left unset.
        '''
        if self.urlCheck(url):
            self.url = url

    def urlCheck(self, url):
        '''Return True when url is acceptable; every url is accepted for now.'''
        #todo , check the url is valid url.
        return True

    def getPage(self):
        '''Download self.url with funcUrlRead and keep the result in self.page.'''
        self.page = funcUrlRead(self.url)

    def _curlCommand(self):
        '''Return the curl argument list: empty user agent (-A ""), silent (-s).'''
        return ['curl', '-A', '', '-s', self.url]

    def curlPage(self):
        '''
        Download self.url with curl and keep its standard output in self.page.

        This requires curl to be installed; OSError is raised when it cannot
        be started.
        '''
        # Local import: subprocess is not among the imports at the top of the file.
        import subprocess
        # An argument list (no shell) hands the url to curl as one argument,
        # whatever characters it contains ('&', spaces, quotes, ...).
        process = subprocess.Popen(self._curlCommand(), stdout=subprocess.PIPE)
        self.page = process.communicate()[0]

    def setPath(self, path):
        '''Set the directory that wgetPage downloads into.'''
        self.path = path

    def _wgetCommand(self):
        '''Return the wget argument list; -c continues a partial download.'''
        return ['wget', '-c', self.url]

    def wgetPage(self):
        '''
        Download self.url into the directory self.path with wget.

        setPath must have been called first. The working directory of the
        process is changed to self.path and stays changed afterwards. This
        requires wget to be installed; OSError is raised when it cannot be
        started. The exit status of wget is not checked.
        '''
        # Local import: subprocess is not among the imports at the top of the file.
        import subprocess
        os.chdir(self.path)
        # No shell involved, so a url containing '&' or ';' is not split up.
        subprocess.call(self._wgetCommand())
class CParsePage:
    '''
    Search a downloaded page for a regular expression.

    The rule is compiled when the object is created. parsePage decodes the
    raw page with the charset reported by chardet and keeps the result of
    re.search in self.match.
    '''

    def __init__(self, rule, page):
        '''
        Compile rule and, when that works, store page as self.page.

        When the rule cannot be compiled neither self.rule nor self.page is
        set.
        '''
        # Compared against False (not tested for truth) so that a ruleCompile
        # returning None for success is accepted as well.
        if self.ruleCompile(rule) != False:
            self.page = page

    def ruleCompile(self, rule):
        '''
        Compile rule into self.rule.

        Returns True on success and False when rule cannot be compiled; in
        the latter case self.rule is left as it was.
        '''
        #compile the rule
        try:
            self.rule = re.compile(rule)
        except Exception:
            # A bad pattern raises re.error and a non-string rule raises
            # TypeError. Exception (not a bare except) keeps
            # KeyboardInterrupt and SystemExit from being swallowed.
            return False
        return True

    def parsePage(self):
        '''
        Decode self.page, search it for self.rule and store the result of
        re.search (a match object or None) in self.match.
        '''
        # The detected encoding can be None (see getCharset); fall back to
        # utf-8 instead of failing on it.
        encoding = self.getCharset(self.page) or 'utf-8'
        # The charset is only a guess, so undecodable bytes are replaced
        # rather than aborting the whole search.
        text = self.page.decode(encoding, 'replace')
        self.match = re.search(self.rule, text)
        funcDebugInfo(type(self.match))

    def getCharset(self, string):
        '''
        Return the 'encoding' entry of chardet.detect(string).

        NOTE(review): chardet documents that this entry is None when it
        cannot make a guess -- callers have to cope with None.
        '''
        import chardet
        #todo : automatic discern the charset
        charset = chardet.detect(string)
        return charset['encoding']
class CGetMatch:
    '''
    Facade over CGetPage and CParsePage: download a url and search the
    downloaded page for a rule.
    '''

    def __init__(self, url, rule):
        '''
        Build the CGetPage/CParsePage pair for url and rule.

        The page behind url is downloaded right away and kept in self.page;
        no search is done yet.
        '''
        self.url, self.rule = url, rule
        self.cgetpage = fetcher = CGetPage(url)
        fetcher.getPage()
        self.page = fetcher.page
        self.cparsepage = CParsePage(rule, fetcher.page)

    def getMatch(self, url, rule):
        '''
        Download url, search the page for rule and store the result of the
        search in self.match. Nothing is returned.

        url and rule replace the values held by this object and by the
        wrapped CGetPage/CParsePage objects.
        '''
        fetcher, parser = self.cgetpage, self.cparsepage
        # Point both helpers at the new target before fetching.
        self.url = fetcher.url = url
        self.rule = parser.rule = rule
        fetcher.getPage()
        self.page = parser.page = fetcher.page
        parser.parsePage()
        self.match = parser.match
# Sites polled by runTest, as (site name, rule, url). Group 1 of every rule
# captures the download count shown by that site.
testSites = (
    # NOTE(review): the quoting/escaping of this rule was damaged in the
    # published listing. It is rebuilt on the assumption that the page holds
    # the literal text  hit[587]='47588,<count>  -- confirm against the page.
    ('huajun',
     r"hit\[587\]='47588,([0-9]+)",
     'http://www.onlinedown.net/soft/hitjs/hits47.js'),
    ('skycn',
     u'下载次数:</b> ([0-9]+)',
     'http://www.skycn.com/soft/23265.html'),
)


def runTest():
    '''
    Fetch every site in testSites and print one numbered line per site.

    A line holds the running number, the site name and the captured count.
    A site that cannot be fetched or searched, or whose page does not match
    its rule, gets a line saying so instead of being skipped silently; the
    remaining sites are still processed.
    '''
    #initialization
    ccount = CGetMatch('http://www.sina.com.cn','')
    for index, (sitename, rule, url) in enumerate(testSites):
        label = str(index + 1).rjust(2) + '.' + sitename.ljust(12) + ':'
        try:
            ccount.getMatch(url, rule)
        except Exception:
            # Best effort: report the problem and go on with the next site.
            funcOutputMessage(label + 'failed (' + repr(sys.exc_info()[1]) + ')')
            continue
        if ccount.match is None:
            funcOutputMessage(label + 'no match')
            continue
        funcOutputMessage(label + str(ccount.match.group(1)))


# Entry point. It has to stay below runTest: the guard is executed while the
# module is being loaded, so runTest must already be defined at this point.
if __name__ == '__main__':
    funcOutputMessage('===This is a get url script===')
    runTest()
Recent Comments