专栏中心

EEPW首页 > 专栏 > 扣丁学堂Python培训简述用Python编写网页爬虫爬取oj

扣丁学堂Python培训简述用Python编写网页爬虫爬取oj

发布人：扣丁学习时间：2020-11-24 来源：工程师

加入技术交流群
- 扫码加入
  和技术大咖面对面交流
  海量资料库查询

发布文章

　　今天我们给大家讲解一段用Python爬取网页的源代码（基于Python 2，使用urllib2和cookielib），全是干货。如果代码中有不足之处，或者各位资深Python开发者有更便捷的实现方式，欢迎指出并与大家分享。

# -*- coding: cp936 -*-
# Python 2 script: urllib2, cookielib and thread do not exist under these
# names in Python 3.
import cookielib
import re
import thread
import time
import urllib
import urllib2

# Install a process-wide opener that keeps cookies in memory, so that any
# cookie set by one urllib2.urlopen() response is sent with later requests.
cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

# Regular-expression section: filters the markup out of a scraped page.
class Tool:
    """Turn the HTML fragment that wraps a submitted solution into plain text.

    The class attributes A-J are compiled patterns; replace_char applies
    them in that order.

    NOTE(review): the published listing lost its HTML entities, tags,
    backslashes and spaces when it was rendered as a web page.  A, C, D, E,
    F and I are restored from their replacement text and visible remnants.
    B and H contained tags that vanished completely; the line-break tag and
    the closing font tag are assumed here -- confirm against a real page.
    """

    A = re.compile(r"&nbsp;")
    B = re.compile(r"<br\s*/?>", re.I)  # presumably the line-break tag -- TODO confirm
    C = re.compile(r"&lt;")
    D = re.compile(r"&gt;")
    E = re.compile(r"&quot;")
    F = re.compile(r"&amp;")
    # The optional leading quote removes the '"' that would otherwise be
    # left behind once J has stripped everything up to 'face='.
    G = re.compile(r'"?Times New Roman">')
    H = re.compile(r"</font>", re.I)  # presumably the closing font tag -- TODO confirm
    I = re.compile(r"&#39;")
    # No re.DOTALL here: '.' stops at a line break, so a page header that
    # spans several lines is not removed by this pattern.
    J = re.compile(r"语言.*?face=")

    def replace_char(self, x):
        """Return x with entities decoded and presentation markup removed.

        x -- text of one scraped code fragment.
        """
        substitutions = (
            (self.A, " "),
            # Replaced by a newline rather than dropped: losing line breaks
            # would merge source lines, a doubled line break is harmless.
            (self.B, "\n"),
            (self.C, "<"),
            (self.D, ">"),
            (self.E, '"'),
            (self.F, "&"),
            (self.G, ""),
            (self.H, ""),
            (self.I, "'"),
            (self.J, ""),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        return x

class HTML_Model:
    """Log in to the NJUPT online judge and save the user's submitted code.

    Python 2 only (urllib / urllib2).  Relies on the module-level Tool class
    and on an installed opener that keeps cookies between requests.
    """

    def __init__(self, u, p):
        """Store the login data and pre-encode the login form.

        u -- user name
        p -- password
        """
        self.userName = u
        self.passWord = p
        self.mytool = Tool()
        self.page = 1  # start crawling at the first status page
        self.postdata = urllib.urlencode({
            'userName': self.userName,
            'password': self.passWord,
        })

    def GetPage(self):
        """Walk the status pages and write each linked solution to '<name>.txt'.

        Files are created in the current working directory.  Network errors
        raised by urllib2 are not caught.
        """
        # The request carries the login URL and the login form.
        login_url = "http://acm.njupt.edu.cn/acmhome/login.do"
        req = urllib2.Request(url=login_url, data=self.postdata)
        urllib2.urlopen(req).read()

        flag = True
        # Keep fetching the next status page while flag is true.
        while flag:
            status_url = (
                "http://acm.njupt.edu.cn/acmhome/showstatus.do"
                "?problemId=null&contestId=null&userName=" + self.userName
                + "&result=1&language=&page=" + str(self.page)
            )
            status_page = urllib2.urlopen(status_url).read()

            # A page that lists submitted code contains a link such as
            # <a href="/acmhome/solutionCode.do?id=..." target="_blank">G++</a>.
            # The first page without such a link ends the crawl, so with 84
            # pages of code, page 85 sets flag to False and page 86 is never
            # requested.  '+' must be escaped: a bare "G++" is not a valid
            # regular expression.
            has_code_link = re.search(r"<a href=.*?G\+\+", status_page)
            print(has_code_link)
            flag = bool(has_code_link)
            print("True" if flag else "False")

            # Collect the id of every solution linked from this page.  The
            # capture group replaces the fixed-offset slice of the published
            # listing, which depended on the exact length of the match.
            solution_ids = re.findall(
                r'<a href="/acmhome/solutionCode\.do\?id=(.*?)"',
                status_page, re.S)
            for solution_id in solution_ids:
                url = ('http://acm.njupt.edu.cn/acmhome/solutionCode.do?id='
                       + solution_id)
                code_page = urllib2.urlopen(url).read()

                # NOTE(review): the tags inside this pattern were lost in the
                # published listing; '</font>' as terminator is assumed.
                mytem = re.findall(
                    r'语言.*?Times New Roman">.*?</font>', code_page, re.S)
                sName = re.findall(r'源码--.*?</strong', code_page, re.S)
                if not mytem:
                    # No code fragment on this page: nothing to write.
                    continue

                for sname in sName:
                    # sname carries the problem id; the last 8 characters
                    # are the matched '</strong'.
                    file_stem = sname[2:len(sname) - 8]
                    print(file_stem)
                    # Write the fragment after the markup filter has run.
                    with open(file_stem + '.txt', 'w+') as f:
                        f.write(self.mytool.replace_char(mytem[0]))
                    print('done!')
            self.page = self.page + 1

if __name__ == "__main__":
    # Ask for the login data, then crawl.  Python 2 (raw_input).
    import getpass

    print(u'plz input the name')
    u = raw_input()
    print(u'plz input password')
    # getpass keeps the password from being echoed on the terminal.
    p = getpass.getpass('')
    myModel = HTML_Model(u, p)
    myModel.GetPage()

　　如今这个代码有两个问题:

　　首先,在标签匹配的时候没有支持多行,也就是爬下来的代码中仍然包括跨度多行的标签,纯代码仍然须要人工提取.

　　第二,由于代码页面中并没有题目的标题信息,所以仅以题号作为文件名称.这样如果升级后的OJ题目顺序发生改变,将无法把题目与代码对应起来.

　　针对第一个问题,修正的方法比較简单:

　　在正則表達式匹配的时候,将第二个參数位置加上re.DOTALL就可以.

　　比如:

　　J=re.compile(r'语言.*?face=',re.DOTALL)

　　对于第二个问题,能够依据题号寻找题目的页面(而非此前代码的页面),然后从题目页面中提取标题信息.

　　在题目页面中,我发现仅仅有标题是用<strong>标签修饰的(原文中的标签名在网页排版时丢失,此处按<strong>还原,请以实际页面为准),所以能够这样匹配

　　sName2=re.findall(r'<strong>([^<]+)</strong>',myPage2,re.S)

　　另外文件命名的时候不能够有空格,所以还要滤除空格

　　sname2=sname2.replace(" ","")

　　即使这样,有时在创建文件时仍然会抛出异常,不过重新运行一次,可能就不会再出现故障.

　　以下是完善后的代码,改动的地方加粗了.

# -*- coding: cp936 -*-
# Python 2 script: urllib2, cookielib and thread do not exist under these
# names in Python 3.
import cookielib
import re
import thread
import time
import urllib
import urllib2

# Install a process-wide opener that keeps cookies in memory, so that any
# cookie set by one urllib2.urlopen() response is sent with later requests.
cookie_support = urllib2.HTTPCookieProcessor(cookielib.CookieJar())
opener = urllib2.build_opener(cookie_support, urllib2.HTTPHandler)
urllib2.install_opener(opener)

class Tool:
    """Turn the HTML fragment that wraps a submitted solution into plain text.

    The class attributes A-J are compiled patterns; replace_char applies
    them in that order.

    NOTE(review): the published listing lost its HTML entities, tags,
    backslashes and spaces when it was rendered as a web page.  A, C, D, E,
    F and I are restored from their replacement text and visible remnants.
    B and H contained tags that vanished completely; the line-break tag and
    the closing font tag are assumed here -- confirm against a real page.
    """

    A = re.compile(r"&nbsp;")
    B = re.compile(r"<br\s*/?>", re.I)  # presumably the line-break tag -- TODO confirm
    C = re.compile(r"&lt;")
    D = re.compile(r"&gt;")
    E = re.compile(r"&quot;")
    F = re.compile(r"&amp;")
    G = re.compile(r'"Times New Roman">')
    H = re.compile(r"</font>", re.I)  # presumably the closing font tag -- TODO confirm
    I = re.compile(r"&#39;")
    # re.DOTALL lets '.' cross line breaks, so a page header that spans
    # several lines is removed as well.
    J = re.compile(r"语言.*?face=", re.DOTALL)

    def replace_char(self, x):
        """Return x with entities decoded and presentation markup removed.

        x -- text of one scraped code fragment.
        """
        substitutions = (
            (self.A, " "),
            # Replaced by a newline rather than dropped: losing line breaks
            # would merge source lines, a doubled line break is harmless.
            (self.B, "\n"),
            (self.C, "<"),
            (self.D, ">"),
            (self.E, '"'),
            (self.F, "&"),
            (self.G, ""),
            (self.H, ""),
            (self.I, "'"),
            (self.J, ""),
        )
        for pattern, replacement in substitutions:
            x = pattern.sub(replacement, x)
        return x

class HTML_Model:
    """Log in to the NJUPT online judge and save the user's submitted code.

    Each solution is written to '<problem id>.<problem title>.txt'.
    Python 2 only (urllib / urllib2).  Relies on the module-level Tool class
    and on an installed opener that keeps cookies between requests.
    """

    def __init__(self, u, p, start_page=81):
        """Store the login data and pre-encode the login form.

        u          -- user name
        p          -- password
        start_page -- first status page to crawl.  The default of 81 is the
                      value hard-coded in the published listing; pass 1 to
                      crawl from the beginning.
        """
        self.userName = u
        self.passWord = p
        self.mytool = Tool()
        self.page = start_page
        self.postdata = urllib.urlencode({
            'userName': self.userName,
            'password': self.passWord,
        })

    @staticmethod
    def _safe_filename(title):
        """Return title without whitespace and without / : * ? " < >.

        Backslash and '|' are deliberately left alone: the page is handled
        as a byte string, and in a multi-byte encoding such as cp936 those
        byte values can be part of a character.
        """
        return re.sub(r'[\s/:*?"<>]+', '', title)

    def GetPage(self):
        """Walk the status pages and write each linked solution to a file.

        Files are created in the current working directory.  Network errors
        raised by urllib2 are not caught.
        """
        login_url = "http://acm.njupt.edu.cn/acmhome/login.do"
        req = urllib2.Request(url=login_url, data=self.postdata)
        urllib2.urlopen(req).read()

        flag = True
        while flag:
            status_url = (
                "http://acm.njupt.edu.cn/acmhome/showstatus.do"
                "?problemId=null&contestId=null&userName=" + self.userName
                + "&result=1&language=&page=" + str(self.page)
            )
            status_page = urllib2.urlopen(status_url).read()

            # The first status page without a link labelled G++ ends the
            # crawl.  '+' must be escaped: a bare "G++" is not a valid
            # regular expression.
            has_code_link = re.search(r"<a href=.*?G\+\+", status_page)
            print(has_code_link)
            flag = bool(has_code_link)
            print("True" if flag else "False")

            # The capture group replaces the fixed-offset slice of the
            # published listing, which depended on the exact match length.
            solution_ids = re.findall(
                r'<a href="/acmhome/solutionCode\.do\?id=(.*?)"',
                status_page, re.S)
            for solution_id in solution_ids:
                url = ('http://acm.njupt.edu.cn/acmhome/solutionCode.do?id='
                       + solution_id)
                code_page = urllib2.urlopen(url).read()

                # NOTE(review): the tags inside this pattern were lost in the
                # published listing; '</font>' as terminator is assumed.
                mytem = re.findall(
                    r'语言.*?Times New Roman">.*?</font>', code_page, re.S)
                sName = re.findall(r'源码--.*?</strong', code_page, re.S)
                if not mytem:
                    # No code fragment on this page: nothing to write.
                    continue

                for sname in sName:
                    # The last 8 characters are the matched '</strong'; the
                    # first 8 are presumably the prefix in front of the
                    # problem id -- TODO confirm against a real page.
                    problem_id = sname[8:len(sname) - 8]
                    url2 = ("http://acm.njupt.edu.cn/acmhome/problemdetail.do"
                            "?&method=showdetail&id=" + problem_id)
                    myPage2 = urllib2.urlopen(url2).read()

                    # NOTE(review): the tag around the title was lost in the
                    # published listing; <strong> is assumed.
                    sName2 = re.findall(
                        r'<strong>([^<]+)</strong>', myPage2, re.S)
                    # Fall back to an empty title instead of failing with
                    # IndexError when the page has no match.
                    title = self._safe_filename(sName2[0]) if sName2 else ''

                    file_stem = problem_id + '.' + title
                    print(file_stem)
                    with open(file_stem + '.txt', 'w+') as f:
                        f.write(self.mytool.replace_char(mytem[0]))
                    print('done!')
            print(self.page)
            self.page = self.page + 1

if __name__ == "__main__":
    # Ask for the login data instead of keeping an account name and a
    # masked placeholder password in the source.  Python 2 (raw_input).
    import getpass

    print(u'plz input the name')
    u = raw_input()
    print(u'plz input password')
    # getpass keeps the password from being echoed on the terminal.
    p = getpass.getpass('')
    myModel = HTML_Model(u, p)
    myModel.GetPage()

　　以上代码仅供参考学习，有不足的地方希望大家能够提出并给出修改意见。也可以加入我们的Python学习交流群：816572891。我们扣丁学堂不仅有专业的讲师，而且还有配套的学习资料和直播、录播的视频教学，同时还有班主任和助教老师的全程指导。如果感兴趣的话，可以直接加入到我们的团队中来一起探讨学习。

专栏文章内容及配图由作者撰写发布，仅供工程师学习之用，如有侵权或者其他违规问题，请联系本站处理。联系我们

关键词：