- 论坛徽章:
- 0
|
# 请直接输入70925,那么只采二条作测试
# -*- coding: cp936 -*-
import urllib,urllib2
import time,random
import re
# --- Script configuration and state -----------------------------------------
# Ask for the ID to start from and convert it to an integer BEFORE the output
# file is opened: open(..., 'w') truncates an existing file, so a mistyped ID
# must fail here rather than after the previous results have been wiped.
lastnumber = raw_input('上个advertisement ID是: ')
IDnumber = int(lastnumber)

# Output file: one tab-separated line per record (written by the main loop).
outfname = '.\\advertisement.txt'
outfile = open(outfname, 'w')

# Number of consecutive IDs that yielded no data (maintained by the main loop).
NotFound = 0
# Exclusive upper bound of the ID range that is scanned.
MaxIDnumber = 999999
# Holds the cells returned by the most recent fetch() call.
results = []
# Detail-page URL; the numeric record ID is appended to it.
initUrl = 'http://app1.sfda.gov.cn/datasearch/face3/content.jsp?tableId=39&tableName=TABLE39&tableView=药品广告&Id='
# Matches from the first "listmain" marker through the last "<td onclick".
# DOTALL (numeric value 16, as in the original flags=16) lets "." span
# newlines, since the matched markup covers many lines.
_LISTMAIN_RE = re.compile('listmain(.*)<td onclick', re.DOTALL)


def _extract_cells(page):
    """Return the list of value-cell texts found in the HTML text *page*.

    Returns an empty list when the "listmain ... <td onclick" section is not
    present in *page*.
    """
    match = _LISTMAIN_RE.search(page)
    if match is None:
        return []
    cont = match.group(0)
    # Flatten the markup; the order of these replacements is significant.
    for old, new in (('<br>', ' '),
                     ('</td>', ' </td>'),
                     ('\n', ' '),
                     ('\r', ' '),
                     ('</a>', ' ')):
        cont = cont.replace(old, new)
    # Make cells that follow 'null);">' split the same way as the cells that
    # follow the 'width=83%>' attribute.
    cont = cont.replace('null);">', 'null);">width=83%>')
    pieces = cont.split('width=83%>')
    # pieces[0] is the text before the first value cell; each later piece
    # starts with a cell's content, which ends at the row terminator.
    return [piece.split('</td></tr>')[0] for piece in pieces[1:]]


def fetch(url):
    """Download *url* and return the value cells of its detail table.

    The download is retried indefinitely: on an IOError, 'A,O' is printed and
    the function sleeps for a random 5-14 seconds before trying again.

    Only the download is retried.  A page that was downloaded successfully
    but does not contain the expected section yields an empty list, so the
    caller can count it as "not found" instead of this function retrying the
    same URL forever.  Other exceptions, including KeyboardInterrupt,
    propagate to the caller.

    Args:
        url: address of the page to download.

    Returns:
        A list of strings, one per value cell; empty when nothing was found.
    """
    while True:
        try:
            conn = urllib.urlopen(url)
            try:
                page = conn.read()
            finally:
                conn.close()
        except IOError:
            print('A,O')
            time.sleep(random.choice(range(5, 15)))
            continue
        return _extract_cells(page)
def lastlet(s):
    """Return the last non-digit character of *s*.

    Surrounding whitespace is stripped first.  If every remaining character
    is a digit, the final character is returned.  An empty or
    whitespace-only string yields '' (instead of raising IndexError).

    Args:
        s: the string to inspect.

    Returns:
        A one-character string, or '' when *s* has no non-whitespace content.
    """
    s = s.strip()
    if not s:
        return ''
    # Walk backwards and stop at the first character that is not a digit.
    for let in reversed(s):
        if not let.isdigit():
            return let
    # Only digits were found: fall back to the last character.
    return s[-1]
# Main loop: visit every ID from IDnumber up to (not including) MaxIDnumber,
# writing one tab-separated line per record that returned data.
# The try/finally guarantees the output file is closed (and its buffer
# flushed) even if the run is interrupted or an unexpected error occurs.
try:
    for IDs in range(IDnumber, MaxIDnumber):
        urls = initUrl + str(IDs)
        print(urls)
        results = fetch(urls)
        if IDs % 100 == 0:
            # Pause for a random 1-4 seconds whenever the ID is a multiple
            # of 100.
            print('I am resting')
            time.sleep(random.choice(range(1, 5)))
        if results:
            # NOTE: an earlier revision derived a category from the last
            # letter of results[0] via lastlet(); that step is disabled.
            NotFound = 0
            # Each line holds the extracted cells followed by the source URL.
            outfile.write('\t'.join(list(results) + [urls]))
            outfile.write('\n')
        else:
            NotFound = NotFound + 1
            if NotFound > 100:
                if IDs < 10000:
                    # Below ID 10000 a run of more than 100 misses only
                    # resets the counter; scanning continues.
                    NotFound = 0
                else:
                    # More than 100 consecutive misses: stop scanning.
                    break
finally:
    outfile.close()
print('Congratulations, work done!')
raw_input('Press return to finish.')
# 请帮我修改一下,谢谢了 |
|