- 论坛徽章:
- 0
|
第一次写 Python,代码是模仿别人的抓取代码修改而成的。脚本把猫眼电影的票房数据抓取到本地的 MongoDB 中;本机设置了定时任务,每半小时通过 bat 脚本调用一次。之后可以基于这些数据查看票房前十的变化,或分析其他问题。个人爱好,代码很粗糙,请见谅。
[Python]代码
- import re
- import urllib.request
- import time
- import pymongo
-
def GetHtmlCode(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address to request; passed unchanged to
            ``urllib.request.urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response even when read() or decode()
    # raises; previously the response object was never closed.
    with urllib.request.urlopen(url) as page:
        return page.read().decode('utf-8')
-
class MovieData:
    """One scraped box-office record for a single film.

    The five values are stored exactly as passed to the constructor; no
    conversion or validation is done here. ``get`` labels them as film name
    (MovieName), box office (MovieMoney), box-office share (MovieMP),
    screen share (MovieDP) and total income (MovieAllMoney).
    """

    # Class-level defaults; __init__ shadows each one on the instance.
    MovieName = ''
    MovieMoney = ''
    MovieMP = ''
    MovieDP = ''
    MovieAllMoney = ''

    def __init__(self, mn, mm, mmp, mdp, mam):
        """Store the five field values on the instance, in positional order."""
        (self.MovieName, self.MovieMoney, self.MovieMP,
         self.MovieDP, self.MovieAllMoney) = (mn, mm, mmp, mdp, mam)

    def get(self):
        """Return the record as a labelled multi-line string.

        The last line carries the local time at which this method is
        called, formatted as ``YYYY-MM-DD HH:MM:SS``.
        """
        stamp = time.strftime('%Y-%m-%d %H:%M:%S')
        values = (
            self.MovieName,
            self.MovieMoney,
            self.MovieMP,
            self.MovieDP,
            self.MovieAllMoney,
            stamp,
        )
        return '电影名称: %s \n 票房: %s万元 \n 票房占比: %s \n 屏幕占比: %s \n 总收入: %s万元 \n 收集时间 is : %s ' % values

    def print(self):
        """Write the text produced by ``get`` to standard output."""
        # Inside a method body the bare name ``print`` still resolves to the
        # builtin, not to this method.
        print(self.get())
def GetDataBlock(pagecode):
    """Return every ``<ul class="canTouch ... </ul>`` fragment in *pagecode*.

    Args:
        pagecode: HTML text to search.

    Returns:
        A list of the matched substrings in document order; empty when
        nothing matches. Each match is non-greedy, so it stops at the first
        ``</ul>`` that follows the opening tag.
    """
    # The pattern has no capture groups, so each match's full text is the
    # list element.
    return [
        hit.group()
        for hit in re.finditer(r'<ul class="canTouch[\s\S]+?</ul>', pagecode)
    ]
def _first_match(pattern, text, what):
    """Return the first match of *pattern* in *text*; raise ValueError naming *what* if absent."""
    found = re.search(pattern, text)
    if found is None:
        raise ValueError('MakeDataInfo: %s not found in data block' % what)
    return found.group()


def _remove_all(text, *patterns):
    """Delete every match of each pattern from *text*, applying the patterns in order."""
    for pattern in patterns:
        text = re.sub(pattern, '', text)
    return text


def MakeDataInfo(block):
    """Build a ``MovieData`` from one ``<ul>`` fragment of the page.

    Five ``<li>`` cells (classes ``c1`` to ``c5``) are located in *block*
    and reduced to plain text:

    * ``c1`` / ``c2``: the text between ``<b>`` and ``</b>``.
    * ``c3`` / ``c4``: the cell content with the ``<li>`` tags removed.
    * ``c5``: the first run of digits in the cell, after any trailing
      ``<span>...`` part is discarded. Anything after a decimal point is
      therefore dropped.

    Args:
        block: HTML text of a single data block.

    Returns:
        ``MovieData(c1, c2, c3, c4, c5)`` holding the extracted strings.

    Raises:
        ValueError: If one of the five cells, or the digits in ``c5``, is
            missing. Previously this surfaced as an ``AttributeError`` on
            ``None`` with no hint of which cell was absent.
    """
    # Strips everything up to and including <b>, then from </b> onwards.
    bold_wrapper = (r'<li[\s\S]+?<b>', r'</b>[\s\S]+?</li>')

    # NOTE(review): c1 is matched with single-quoted class="..." while c2-c5
    # use double quotes; kept as-is -- confirm it mirrors the page markup.
    name = _remove_all(
        _first_match(r'<li class=\'c1\'>[\s\S]+?</li>', block, "<li class='c1'>"),
        *bold_wrapper
    )
    income = _remove_all(
        _first_match(r'<li class="c2">[\s\S]+?</li>', block, '<li class="c2">'),
        *bold_wrapper
    )
    money_share = _remove_all(
        _first_match(r'<li class="c3">[\s\S]+?</li>', block, '<li class="c3">'),
        r'<li class="c3">', r'</li>'
    )
    screen_share = _remove_all(
        _first_match(r'<li class="c4">[\s\S]+?</li>', block, '<li class="c4">'),
        r'<li class="c4">', r'</li>'
    )
    total_cell = _remove_all(
        _first_match(r'<li class="c5">[\s\S]+?</li>', block, '<li class="c5">'),
        r'<li class="c5">', r'<span[\s\S]+?</li>'
    )
    total = _first_match(r'[\d]+', total_cell, 'digits inside <li class="c5">')

    return MovieData(name, income, money_share, screen_share, total)
-
# MongoDB connection to 'localhost' on port 27017, opened once at import time
# and shared by every insert() call.
client = pymongo.MongoClient('localhost', 27017)
# Handle to the database named "test" on that connection.
db = client.test


def insert(name, income, rate1, rate2, alincome, time):
    """Store one record as a document in the ``movie`` collection of ``db``.

    Args:
        name: Saved under the key ``"name"``.
        income: Saved under the key ``"income"``.
        rate1: Saved under the key ``"mrate"``.
        rate2: Saved under the key ``"drate"``.
        alincome: Saved under the key ``"allincome"``.
        time: Saved under the key ``"time"``. Inside this function the
            parameter hides the ``time`` module.

    Returns:
        None. The result of ``insert_one`` is discarded.
    """
    document = {
        "name": name,
        "income": income,
        "mrate": rate1,
        "drate": rate2,
        "allincome": alincome,
        "time": time,
    }
    db.movie.insert_one(document)
-
-
# Script body: fetch the page, store every parsed record in MongoDB, echo it
# to stdout, then append the text of all records to a local log file.

# Taken once so that every record stored by this run shares the same
# collection time.
tm1 = time.strftime('%Y-%m-%d %H:%M:%S')
tm2 = tm1
pc = GetHtmlCode('http://piaofang.maoyan.com/')
bl = GetDataBlock(pc)

reports = []
for i in bl:
    mv = MakeDataInfo(i)
    insert(mv.MovieName, mv.MovieMoney, mv.MovieMP, mv.MovieDP, mv.MovieAllMoney, tm2)
    mv.print()
    reports.append(mv.get())

# Joined once instead of growing a string with += on every iteration.
mdata = ''.join(reports)

# The with-block closes the log file even if write() raises; the previous
# open()/close() pair leaked the handle in that case.
with open('C:\\Users\\Jason\\Desktop\\wy.txt', 'a', encoding='utf-8') as fileObj:
    fileObj.write(mdata)
复制代码 |
|