def downloadAll(self):
    """Download every URL queued in the global ``g_toDlUrl``, batch by batch.

    Each batch issues at most ``self.threadNumber`` calls to
    ``self.download``.  Every call first increments the global
    ``g_totalcount`` and passes ``str(g_totalcount) + '.htm'`` as the second
    argument.  After a batch, each entry of ``self.threadPool`` is joined
    with a 30-second timeout and the pool is replaced by an empty list.
    Once the whole queue has been walked, ``g_toDlUrl`` is rebound to an
    empty list.

    NOTE(review): presumably ``self.download`` starts a thread and appends it
    to ``self.threadPool`` -- confirm against its definition.
    NOTE(review): the nesting (join/reset once per batch, queue cleared once
    at the end) is reconstructed from the statement order -- confirm.
    NOTE(review): with a non-empty queue and ``self.threadNumber`` < 1 no
    progress is made and the outer loop never terminates.
    """
    global g_toDlUrl
    global g_totalcount
    batch_start = 0
    while batch_start < len(g_toDlUrl):
        launched = 0
        while launched < self.threadNumber:
            position = batch_start + launched
            if position >= len(g_toDlUrl):
                break
            # One more page is being downloaded; the counter also names the file.
            g_totalcount += 1
            self.download(g_toDlUrl[position], str(g_totalcount) + '.htm')
            # Single-argument print(...) behaves the same on Python 2 and 3;
            # joining with ' ' reproduces the spacing of the old print statement.
            print(' '.join(['Thread started:', str(position),
                            '--File number = ', str(g_totalcount)]))
            launched += 1
        batch_start += launched
        for worker in self.threadPool:
            # Wait for the worker to finish, giving up after 30 seconds.
            worker.join(30)
        # Empty the pool before the next batch.
        self.threadPool = []
    # Everything queued has been handed to download(); reset the queue.
    g_toDlUrl = []
def updateToDl(self):
    """Rebuild the global ``g_toDlUrl`` from URLs extracted out of ``g_pages``.

    Each item of the module-level ``g_pages`` is passed to ``GetUrl.GetUrl``
    and the returned items are pooled.  ``g_toDlUrl`` is then rebound to the
    pooled items minus everything in ``g_dledUrl``.  The subtraction goes
    through sets, so duplicates are dropped and the order of the resulting
    list is unspecified.
    """
    global g_toDlUrl
    global g_dledUrl
    # NOTE(review): the original comment says GetUrl still needs a concrete
    # implementation -- confirm it exists before relying on this method.
    harvested = [url for page in g_pages for url in GetUrl.GetUrl(page)]
    # NOTE(review): the original author flagged this step as reporting
    # "unhashable"; presumably some element is not hashable -- confirm what
    # GetUrl.GetUrl returns and what g_dledUrl holds.
    pending = set(harvested) - set(g_dledUrl)
    g_toDlUrl = list(pending)
def is_sep(ch):
    """Return True if ``ch`` compares equal to any element of ``urlSep``.

    ``urlSep`` is a module-level iterable defined outside this function.
    Returns False when no element matches.
    """
    return any(candidate == ch for candidate in urlSep)
def find_first_sep(i,s):
    """Return the first index at or after ``i`` where ``s`` holds a separator.

    Elements are tested with ``is_sep``.  ``len(s)`` is returned when no
    position from ``i`` onward satisfies it, including when ``i`` is already
    past the end of ``s``.
    """
    limit = len(s)
    pos = i
    # Advance while still inside s and the current element is not a separator.
    while pos < limit and not is_sep(s[pos]):
        pos += 1
    # Either pos stopped on a separator (pos < limit) or the scan ran out.
    return min(pos, limit)
def GetUrl(strPage):
rtList = []
for tag in urlTag:
i = 0
i = strPage.find(tag, i, len(strPage))
while i != -1:
begin = i
end = find_first_sep(begin+len(tag),strPage)
rtList.append(strPage[begin:end])
i = strPage.find(tag, end, len(strPage))