- 论坛徽章:
- 0
|
先贴一下代码 周末闲着无聊 做了一个多线程下载songtaste音乐的脚本
如下- #!/usr/bin/python
- #coding:utf-8
- import re
- import urllib
- import urlparse
- import httplib
- import time
- import sys
- import os
- from threading import Thread
- songtype = {'7d99bb4c7bd4602c342e2bb826ee8777':'.wma','25e4f07f5123910814d9b8f3958385ba':'.Wma',
- '51bbd020689d1ce1c845a484995c0cce':'.WMA','b3a7a4e64bcd8aabe4cabe0e55b57af5':'.mp3',
- 'd82029f73bcaf052be8930f6f4247184':'.MP3','5fd91d90d9618feca4740ac1f2e7948f':'.Mp3'}
- outputdir = '/home/pxiaohai/Music/'
- class MultiThreadDown(Thread,urllib.FancyURLopener):
- def __init__(self,threadname,url,filename,ranges):
- Thread.__init__(self,name=threadname)
- urllib.FancyURLopener.__init__(self)
- self.name = threadname
- self.url = url
- self.filename = filename
- self.ranges = ranges
- self.downloaded = 0
- def run(self):
- try:
- self.downloaded = os.path.getsize(self.filename)
- print self.filename
- except OSError:
- self.downloaded = 0
- self.startpoint = self.ranges[0] + self.downloaded
- if self.startpoint >= self.ranges[1]:
- print 'part %s has been downloaded over.' % self.filename
- return
- self.oneTimeSize = 16384
- print 'task %s will download from %d to %d' % (self.name,self.startpoint,self.ranges[1])
- self.addheader("Range","bytes=%d-%d" % (self.ranges[0],self.ranges[1]))
- self.urlhandle = self.open(self.url)
- data = self.urlhandle.read(self.oneTimeSize)
- while data:
- filehandle = open(self.filename,'ab+')
- filehandle.write(data)
- filehandle.close()
- self.downloaded +=len(data)
- data = self.urlhandle.read(self.oneTimeSize)
- class DownSong():
- def __init__(self,ms):
- self.ms = ms
- def down(self):
- global outputdir
- global songtype
- for song in ms:
- suburl = 'http://www.songtaste.com/song/%s/' % (song[0])
- songname = song[1]
- m = re.compile('playmedia1\((.+?)\)')
- f = urllib.urlopen(suburl)
- data = f.read()
- s = []
- s = m.findall(data)[-1].replace('\'','').replace('"','').replace(' ','').split(',')
- print s
- print s[1]
- songname = songname + songtype.get(s[5])
- songurl = s[6] + s[2] + songtype.get(s[5])
- try:
- url1=urllib.urlopen(songurl)
- except:
- show = u'\r %s can not be downloaded' % (songname)
- print show
- continue
- blocks = 4
- filesize = self.GetUrlFileSize(songurl)
- ranges = self.SpliteBlocks(filesize,blocks)
- print ranges
- threadname = ["thread_%d" % i for i in range(0,blocks)]
- filename = ["tmpfiles_%d" % i for i in range(0,blocks)]
- tasks = []
- for i in range(0,blocks):
- task = MultiThreadDown(threadname[i],songurl,filename[i],ranges[i])
- task.setDaemon(True)
- task.start()
- tasks.append(task)
- time.sleep(1)
- songname = songname.decode('gb18030','replace').strip()
- songname = ''.join(songname.split())
- show = u'\r %s is starting download' % (songname)
- print show
- while self.islive(tasks):
- downloaded = sum([task.downloaded for task in tasks])
- process = downloaded/float(filesize)*100
- show = u'\rFilesize:%d Downloaded:%d Completed:%.2f%%' % (filesize,downloaded,process)
- sys.stdout.write(show)
- sys.stdout.flush()
- time.sleep(0.5)
- show = u'\r %s completed' % (songname)
- print show
- songname = outputdir + songname
- filehandle = open(songname,'wb+')
- for i in filename:
- f = open(i,'rb')
- filehandle.write(f.read())
- f.close()
- try:
- os.remove(i)
- print ""
- except:
- pass
- filehandle.close()
- def GetUrlFileSize(self,url):
- urlHandler = urllib.urlopen(url)
- headers = urlHandler.info().headers
- length = 0
- for header in headers:
- if header.find('Length') != -1:
- length = header.split(':')[-1].strip()
- length = int(length)
- return length
- def SpliteBlocks(self,totalsize, blocknumber):
- blocksize = totalsize/blocknumber
- ranges = []
- for i in range(0, blocknumber-1):
- ranges.append((i*blocksize, i*blocksize +blocksize - 1))
- ranges.append(( blocksize*(blocknumber-1), totalsize -1 ))
- return ranges
- def islive(self,tasks):
- for task in tasks:
- if task.isAlive():
- return True
- return False
- if __name__ == '__main__':
- # if len(sys.argv) < 1:
- # print u'please input the url'
- # else:
- # if not re.match("^https?://[^ ]+",sys.argv[0]):
- # print u'please input the url like "http://www.songtaste.com/music/catsong/cat2/"'
- m = re.compile(r'(.+?)')
- #
- url = 'http://www.songtaste.com/music/catsong/cat2/'
- f = urllib.urlopen(url)
- data = f.read()
- ms = []
- ms = m.findall(data)
- ds = DownSong(ms)
- ds.down()
复制代码 用到了多线程下载 但是遇了问题
比如 我开4个线程 一个音乐文件大小为 4M 这样一个线程只下载1M 就可以了
但事实不是这样的 下载下来的文件 是 tmpfile_1 为 1M
tmpfile_2 为 2M
tmpfile_3 为 3M
tmpfile_4 为 4M
最终合成的一个音乐文件 变成了 10M
一开始以为自己的代码有问题 经过一一排查之后发现代码没有问题
我又写了一个测试文件用的- #!/usr/bin/python
- import urllib
- import time
- handler = urllib.FancyURLopener()
- handler.addheader("Range",'bytes=3000000-11356207')
- urlhandle = handler.open('http://224.cachefile20.rayfile.com/1871/zh-cn/download/f92b79ffbd15686f65aa203d305823af/preview.MP3')
- data = urlhandle.read(16384)
- while data:
- filehandle = open("a.mp3",'ab+')
- filehandle.write(data)
- filehandle.close()
- data = urlhandle.read(16384)
复制代码 文件总大小为 11356207 'bytes=3000000-11356207' 这里的 Range 是设置 为 3000000-11356207
和 5000000-11356207 的时候 下载下来的文件大小都是 11356207
这说明 bytes=startpoint-endpoint startpoint 没有起作用 而 endpoint 起作用了
我也从其他的地方下载文件 都没问题 但是 下载songtase 上的音乐文件就会这样
实在不知什么原因 不知道是不是 rayfile 搞了什么鬼 |
|