Linux_manne
发表于 2014-12-02 15:34
本帖最后由 Linux_manne 于 2014-12-02 15:58 编辑
各文件 50w测出来2.4 秒 请楼主帮忙验证 后续看看还有无优化
后续更新def fequal(c):
with open('result1','a') as f:
f.writelines(c+"\n")
def fbigger(c):
with open('result2','a') as f:
f.writelines(c+"\n")
def fsmaller(c):
with open('result3','a') as f:
f.writelines(c+"\n")
def fonly(fname,c):
with open(fname,'a') as f:
f.writelines(c+"\n")
f1 = open('file1','r')
f2 = open('file2','r')
d1 = dict(line.split() for line in f1)
d2 = dict(line.split() for line in f2)
x = set(d1.keys())
y = set(d2.keys())
tmp1=''
tmp2=''
tmp3=''
tmp4=''
tmp5=''
for k in x-y:
tmp1 += "%s only file1 %s\n" %(k,d1)
fonly('result4',tmp1)
for k in y-x:
tmp2 += "%s only file2 %s\n" %(k,d2)
fonly('result5',tmp2)
for k in x&y:
if k in d2.keys():
if d1 == d2:
tmp3 += "%s equal %s\n" %(d1,d2)
if d1 > d2:
tmp4 += "file1 bigger file2\n"
if d1 < d2:
tmp5 += "file2 bigger file1\n"
fequal(tmp3)
fbigger(tmp4)
fsmaller(tmp5)1.8秒
love_shift
发表于 2014-12-02 16:47
#!/usr/bin/env python
#
#
from threading import Thread
class GetFileThread(Thread):
def __init__(self, setList, method):
self.setList = setList
self.method = method
super(GetFileThread, self).__init__()
def run(self):
if self.method == "equal":
for key in self.setList:
if d1 > d2:
r2.write('%d file1biggerfile2 %d\n' % (key,d1))
elif d1 < d2:
r3.write('%d file2biggerfile1 %d\n' % (key,d2))
else:
r1.write('%d equal %d\n' % (key,d1))
elif self.method == "inf1":
for key in self.setList:
r4.write('%d onlyinfile1 %d\n' % (key,d1))
elif self.method == "inf2":
for key in self.setList:
r5.write('%d onlyinfile2 %d\n' % (key,d2))
r1 = open('result1','w+')
r2 = open('result2','w+')
r3 = open('result3','w+')
r4 = open('result4','w+')
r5 = open('result5','w+')
d1 = dict([(int(line.split()),int(line.split())) for line in open('t1','r')])
d2 = dict([(int(line.split()),int(line.split())) for line in open('t2','r')])
s1 = set(d1)
s2 = set(d2)
threads = []
t1 = GetFileThread(s1&s2,'equal')
threads.append(t1)
t1.start()
t2 = GetFileThread(s1-s2,'inf1')
threads.append(t2)
t2.start()
t3 = GetFileThread(s2-s1,'inf2')
threads.append(t3)
t3.start()
for t in threads:
t.join()
r1.close()
r2.close()
r3.close()
r4.close()
r5.close()
:mrgreen:加个线程,大文件时候看下是否有效果!
caoshanhu
发表于 2014-12-02 17:57
我是来看看,学习的
Hadron74
发表于 2014-12-03 16:39
本帖最后由 Hadron74 于 2014-12-05 10:48 编辑
这两个文件如果键值是排序好的,是一个归并排序的问题;如果没有排序好,应先排序,再归并,算法复杂度log(n)+log(m)+m+n.
用集合的运算时间复杂度太大了。
这里给出一个用Python的代码:__author__ = 'luhongc'
# -*- coding: utf_8 -*-
file1 = """12212453
12235687
12437683
12451000
"""
file2 ="""1221 2453
1223 2000
1245 5612
1265 8000
1287 4321
"""
#file1 = open("file1").read()# 对外部文件的输入
#file2 = open("file2").read()
INfile1 = file1.strip().split('\n')
INfile2 = file2.strip().split('\n')
Key1 = [ [ int(i) for i in kv.split()]for kv in INfile1]
Key2 = [ [ int(i) for i in kv.split()] for kv in INfile2]
Key1.sort(key=lambda x: x)# 队列按键排序,如果排序好文件,可以省略这步
Key2.sort(key=lambda x: x)
class DATA:
def __init__(self,Key,KeyOther,fonly,othername):
self.Key = Key # 本队列
self.KeyOther = KeyOther # 另一个队列
self.fonly = fonly # 另一个队列的输出文件句柄
self.othername = othername # 另一个队列的名字
self.head = 0 # 对列
def get(self,other_header):
try:
t,v = self.Key
self.has_data = True # 是否能 得到 一个 数据
self.t = t # 键
self.v = v # 值
except:
# 如果一个 序列 为空,则把另一个序列的 值 输出到 指定 文件 中
for key,value in self.KeyOther:
self.fonly.write("%d onlyin%s %d\n" % (key,self.othername,value))
self.has_data = False# 返回 序列为空
self.t = "NA"
self.v = "NA"
return self
def pop(self):
self.head += 1 # 弹出本队列最小元素
return
with open("result1","w") as R1 , \
open("result2","w") as R2 , \
open("result3","w") as R3 , \
open("result4","w") as R4 , \
open("result5","w") as R5:
D1 = DATA(Key1,Key2,R5,"file2")
D2 = DATA(Key2,Key1,R4,"file1")
D1.get(D2.head)
D2.get(D1.head)
while True: #归并算法,总选取两个队列中小的键进行处理
if (D1.t > D2.t):
R5.write("%d onlyin%s %d\n" % (D2.t,"file2",D2.v))
D2.pop()
D2.get(D1.head)
if not D2.has_data:
break
elif (D1.t < D2.t):
R4.write("%d onlyin%s %d\n" % (D1.t,"file1",D1.v))
D1.pop()
D1.get(D2.head)
if not D1.has_data:
break
else:
if ( D1.v > D2.v ):
R2.write("%d file1biggerfile2 %d\n" % (D1.t, D1.v-D2.v))
elif (D1.v < D2.v):
R3.write("%d file2biggerfile1 %d\n" % (D1.t, D2.v-D1.v))
else:
R1.write("%d equal %d\n" % (D1.t, D1.v ) )
D1.pop()
D2.pop()
D1.get(D2.head)
if not D1.has_data:
break
D2.get(D1.head)
if not D2.has_data:
break
KoomIer
发表于 2014-12-04 17:03
已开奖
回复 1# KoomIer
jcdiy0601
发表于 2014-12-04 17:51
回复 15# KoomIer
:luya: 是说参与的都有奖吗
ssfjhh
发表于 2014-12-05 09:34
楼主,挺多参与者用的py3,也都很想看到结果如何,但是没有数据,你一句py3,no result,打击我们py3用户参与的积极性啊。
KoomIer
发表于 2014-12-05 09:45
对
回复 16# jcdiy0601
KoomIer
发表于 2014-12-05 09:47
这里我只能抱歉,而且我是新手,又没能力改3为2
如果有时间,你帮我改下
而且关键是算法思路
回复 17# ssfjhh
ssfjhh
发表于 2014-12-05 09:52
python3 balabala.py这样不行么?