- 论坛徽章:
- 1
|
两道题,问了N多人,没结果,再问一下看看
今天起了个大早,不过还是比较失望....
to bioinfor :
我知道,我前面的程序里已经说明了:要真正符合你的要求,需要改两个数据,我只是用这个短的来测试一下.下面再发一个版本,其中还有一个问题未处理:
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <map>
#include <set>
#include <ctime>
#include <cstdlib>
#include <algorithm>
#include <utility>
using namespace std;

// Number of characters RandString() appends to its argument.
const int SizeMax = 10000;

// Returns one of 'A', 'C', 'G', 'T' chosen with rand().
char RandChar();
// Appends SizeMax random characters to the given string.
void RandString(string &);
// Writes the source string and the repeated substrings (with 1-based start
// positions) to the given stream.
void OutputStatics(ofstream &, const string &, const map<string, vector<int> > &);
// Finds substrings of at least the given length (default 10) that occur again
// later in the string, and writes the report to a file.
void FindSameStr(const string &, string::size_type = 10);
// Planned reverse-string variant of FindSameStr (default minimum length 10).
void FindSameReverseStr(const string &, string::size_type = 10);
- int main()
- {
- srand(time(0)); //应该放在这里(其它某些地方也可以),居然犯这种错误.:p
- string testString;
- RandString(testString);
- FindSameStr(testString);
- // FindSameReverseStr(testString);
- }
// Returns one pseudo-random character from the alphabet {A, C, G, T}.
//
// Consumes exactly one rand() value per call and maps 0->'A', 1->'C',
// 2->'G', 3->'T'. The generator must be seeded by the caller (seeding here
// with the current time would return the same character on every call made
// within the same second).
char RandChar()
{
    static const char kAlphabet[] = "ACGT";
    // rand() is never negative, so rand() % 4 is always a valid index 0..3;
    // no error branch is needed.
    return kAlphabet[std::rand() % 4];
}
- void RandString(string &s)
- {
- for (int i=0; i<SizeMax; ++i)
- s.push_back(RandChar());
- }
-
// Writes a report of repeated substrings to `out`.
//
// out      - destination stream.
// s        - the source string; printed in full at the top of the report.
// sameStr  - maps each repeated substring to the 0-based positions where it
//            starts in s.
//
// For every map entry the report shows the substring in quotes, its length,
// and its start positions converted to 1-based numbering.
void OutputStatics(std::ofstream &out,
                   const std::string &s,
                   const std::map<std::string, std::vector<int> > &sameStr)
{
    typedef std::map<std::string, std::vector<int> >::const_iterator EntryIter;
    typedef std::vector<int>::const_iterator PosIter;

    out << "Source String:\n" << s << "\n\n\n";
    out << "The Same String in the Source String:\n\n";

    for (EntryIter entry = sameStr.begin(); entry != sameStr.end(); ++entry)
    {
        const std::string &text = entry->first;
        const std::vector<int> &starts = entry->second;

        out << "\"" << text << "\"" << '\n'
            << " Length: " << text.size() << '\n'
            << "The Start Pos: ";
        for (PosIter p = starts.begin(); p != starts.end(); ++p)
            out << *p + 1 << " ";   // report positions starting at 1, not 0
        out << "\n\n";
    }
}
-
- void FindSameStr(const string &s, string::size_type minLen)
- {
- string::size_type idx = 0, pos = 0, maxLen = minLen;
- set<pair<string,string::size_type>; >; SetStr; //元素为字符串与开始查找的位置对
- string tempFound, moreLenFound, Found;
-
- while (1) //重要:前面的字符串没有同样的时候还需要找后面的,直到最后一个可能的字符串
- {
- tempFound = string(s,idx,minLen);
- //只要能找到一个同样的就行,注意是从该被找字符串之后开始找
- while ( (s.find(tempFound,idx+1)) != string::npos)
- {
- Found = moreLenFound = tempFound;
- //是否存在更长的同样的字符串
- while (1)
- {
- moreLenFound.push_back(s[idx + maxLen++ - 1]);
- //再也找不到更长的字符串了
- if (s.find(moreLenFound,idx + maxLen - minLen) == string::npos)
- {
- SetStr.insert(make_pair(Found,idx));
- idx += (maxLen - minLen);
- maxLen = minLen;
- tempFound = string(s,idx,minLen); //构造好下一个被找字符串
- break;
- }
- Found = moreLenFound; //保存最长同样的字符串
- }
- }
- if ( (++idx + 2*minLen) >; SizeMax )
- break;
- }
- map<string,vector<int>; >; mvs;
- //处理SetStr中的字符串,找到所有共同的字符串的起点
- for (set<pair<string,string::size_type>; >;::iterator iter = SetStr.begin();
- iter != SetStr.end(); ++iter)
- {
- pos = iter->;second; //不是从0开始找而是从被查找字符串的开始位置开始找
- while( (pos = s.find((*iter).first,pos)) != string::npos)
- {
- mvs[(*iter).first].push_back(pos);
- if((pos++) >;= SizeMax)
- break;
- }
- }
- ofstream coutstr("C:\\result.txt");
- OutputStatics(coutstr,s,mvs); //可以直接输出到cout上
- }
// Placeholder for the reverse-string variant of FindSameStr(); currently does
// nothing.
//
// Intended approach (not yet written): reverse the searched substring at the
// appropriate point and otherwise proceed exactly as FindSameStr() does. A
// bool parameter on a shared implementation would also work.
void FindSameReverseStr(const std::string &s, std::string::size_type minLen)
{
    // Parameters are deliberately unused until the function is implemented.
    (void)s;
    (void)minLen;
}
复制代码 我运行一次后的情况如下:Source String:
TAGCAGCAGGTTTACGGACACCCTCCTCTCGGCTACTGCTAGTGACGGGGCGCTGCGAGGCGCAGACGGCAAGGTTCTAGCCAAACGTATGCATCGATATTTTGAGAGAGAACGGGGCCTCCTTCCTACAGAAACGTGGATGGGACGATAAAGAAGGAGATTGGATTGCCAAAGCAACATTGTGAGCAAATAAACAGGCATATGGTCATACGGTCGCATCTCCATGGGATGAGTGGCGTAAACCCGTTTGGATGCGAGGCCTGAAGGCCCTTTGTCGCACAACCCCTAGACACGTGTCTGCACGAGACGGCCGGAATAGCTAGTATGGGAAGTCTAGAAGGGGGTTCGTAGGGGGGGAGGAGCATGACTTGGCCGACAGGTCCGTCAGTTAATATATAGCTAGTGGGATCTACGACGGCGTCTTGGGGATCCAACATTCACGCAACGTTCATGGCACACTGTTATCTGTGATTTATAACCTGGTCCGGACCGCACTATTACAGGCTTCAACGACTAGAACTTCAGCGATAGGTCATTAGAGCTGTCGGGCACGAGTTTTCCGTCACCCCAGGCGTGCATTGCATCCACGGTGAGTTATGATATGCCACAATCAAAGAGGGCGACGACTTATCGTCCGATTGGGAGGATAGAAGTGACAGGCCTTATTCAATCCCTGTGTATTACTAGTTGGGTATTGGCAACCGTTTCGGCTGTTAGTAATCAGTCGTCCAGATAGTGGCCATAGTAGGGTCATCGGGCCTAAGCCCGATTATCCCCTCGTGACAGATCCTCGTAAAAACAGGAAAGACTGACGGCGTGAGTCCAGAATCATGTAAATTCCGACAAGAGCGCGATTTAGTACGCGGCCTATGACTTAGTGAGCTGAAGGGAACTCCAGACCCTCAAGACCGAAATCCTCTCAGTAGGCACGTGAGCATCATCTAGATTGACCAGCCACACGCGAGCCTAGGGGGCAGGCGTACATAAAACAATGCGGGAGACGGTACAAGTCTAACGCCTGGCGCGTAAGGGATAAAGGTCAGAGCTGCTCTCAATCATGCAATACGCGCCCTTCTAGCCAAGGCTCAATCCGCCGATAAGCCGAAGTCGTGTCATGGTAAATACCGGCCTCGGTTGTCTCACAGCTTATTTTTGTAAGTCTTCAATTTATAACGTGGCATAAGTACTGCGGCATTTGAGCTACGCTTCGCCTGTAGGCTGCCGGCATGGGAGTTGTATCAGCTCATCTTCTGCAAGTCGTGTGCCCTACTAAGGATACGCCTTCTTGTTTGCTAACGCCTGGATGTTGACGGCTATTCGTAGGACAAATATGCCGTGAATCGCCTTCATTTACCTATCACTCCTTACTCTAATCTGGATACAGCCATCGTTACCAAAGCTAGGTGCCCAACTTCGCCACTTTGCATTTTTGGTCGTCTGGTTTCTGTGCCCGACTACCGGGCCGGCCTGGTCACCGCGGTCGATCACGAAGATCCGCCCCTTGTTAATTCGAAACAAACCCGGATGTGTTCACCCGTTTGTATCAGCGTTCGTTGAGGGGGAGAACATGAGATTCGTCGGATAGATGTTCGGAGGCTTGTGAGGCCCAGCGGCAGTGATCAATTGCATAGTAAGGGGTGCCCCAGTAGGCAGTTGTTCCGCTTAATGGCAAGTCCGCTCAAAGGCCGTCAAGTCTCGGTCACATGTAGCCCATCTGGTTAGCCTGCCCCAAGCGCCAAGTCCCCCGGAGAGAAGAGGGAGTACTCGAGGTCCGGACAAACCCGCAAGCGCCGGTTTACTAAGTTACGTCGTGGCTCCGGCAGGATCCGGCACGCTTTCTTACGCGCGGGAATGAGAGCTATTGTTCGAGTACCACACCGGGACCCAGGGTCCGGGCAGGGTTACTCCCTATTGAACCCCGCCACATTTGTGCAAATAACGAGGATACCCATACATACCCAGATGGGTTGGCCTAGCGCCTGAAATTTGCGCGTTTTAGC
CGTCACGTGGGTGCGCGTAGAAGCAACCCAAGCAGGGGACTCCTGAGCTCTGACTAGACCCATTATAATGACACTCTTTCCACTTGGTCACACGATACACTCCGCAGGCACTCCGTGCCATACCGATGGATTGGCCGTGCAGCCACCATTAGCAGACTGCTCCCATCCACTTAATTACTGATTCTAAGCGATCACTCGAGCATTCTGTTATTATCGGTGGCTTAGATGCAGAGGCCTTTTACGCGCGAAGCATAACCTACAACAACTCGTGGGTTAAGTTGGACGGGCGTACTAGCCGGATCGCTTTGCGAACAAAGCGAACGAAGATTACCATTTGCGCCCAGATACTCTTCCTACACTAGAAGGGATCTAATACGGCACCGAATGCAACTGTCGGGTCGTTAGTTGCATGATGGCGACTGCGAAGGTTCATGCTTGGAAGCGACAGATCGAACCGGTAACCAGTGTTAATTAACCAAGCTAACAAGAGCCTTGGTGCAGCGAGGTCTACCAAGCCATAGTTATGCAGTTCCCACCTTAGAATGGGTAGCAATTCAGGCAGTGGTCCACTGTGACCTCCCTTCGAGACTACTGGTTCGCTGAGAAGGGCGAGGCTAAGGACTCACACGCGTGTGATTCTGCGTGCGGGATGATAAACATCAGGCAAAGTGGTAAGCGGTATCCCGTGCTTCGCCGAAGGTTCCTTCCAGTCTTGGATTACGAATGAAACGTCCCTACCCCCGACGCCTTCCTAATGCGTTCCTTTGCAATCCTGCTGAGTCCCGTTGCTATTCGATGCGGGAGATATGATGAAGCACACTAGCGCTAAGGGGCTGTTTGATAATTAAATTCAATAGCAGGTCGAGCTACCTTGAAGTAGGACCATAACAGTCAACAACCCGGCGAGTTAGTGTGGTAAGCTAGCTGTGCAGAGTACGTAAGCAATCTCAACCTATGGCAGTGATAACTCGTTAGAAGTAGTACCCTGAGCCATGGCGATGGCCTAAAGGTCTCTTCAAATTACACACCGAGCAACTAAGGTGTGGCCCTAAAATCGCGACCTGACATCCGCCTACACTGCCGGCTCTGTCTATGATTCTATCGCCATGTCTCTATAGTCCGATGACTGAGGACCTTAGGGAGGCCGGTGATCTCTTTTAGCATCCATACTCGATGCTCTATGTAGATTCACCCGTTCTGGATAATTTGCCTCCCTATGTGTCAAGCCGGCCTCGAGCTGGGCCATGTTGTAAGGCAGCCTATTCTGATCTGGTGCAGCCAGGCGCGTTTCTATAAGAGAGAGTTCTAAGCTGATGGTGGTGCAGGGACGACCGAGTCCCGTTATCAACTGGGGTTGTCAGGAGTTTGAGTACCGCGGTGAGCTGGAAGAGTCATTAGATCTGCCATCTGAACATGCCTAAGCGGTCAACGGCTGCGTCAGAATGCCTCACGGACACCCGTGTGGTGTATCGTTGGCCTACACGGATCGAAACGTTTTTAAATAGATCATATCGCCTAGCCCCTTCTCATATAGGCTTTCGACTCCCGAGTCGACCTTTCCGGACATGTACCTACCCAGCTAGACAAAGTGGGGATTACGAGGCGTCACGATTGGTTTCACGACTCTCTACCTTCGCCAACAGGAGTAGTCTGATCAATCTCGTCCTGTCTCCGCCGGTCACTTTCAGCTTTCTCCACATCCGAGAACCATGTCTAGCGATCTAAGGGGTTCCTAAGCAACGGCCTTACATAAACTTCAGCGATAAGCGGCCGCCAAGCCTCTCCGGAACTCTAACGATATAGACATTGACGTCTTTACTGTCATTTTTGAATCTGACGAGTAATATTAGTCCATTCAAGATACACGGAGGCAAGGGGGAGATCATAAATACTAAAAGAAGACCATGAAGCGACTACTGCGATAGTAACGACATACGTATCGTGCGTCATCCGGAATATCGTTAATCAGGGCCACTTACATAGCATTAACGATTACCAAG
AGCAACGCCAGCTGCTCCCAACATAGCCGCCTAAAATCTATCCCACCGTCGCGTGCCGGTCTGAGACAATACCGTTGCGTTTCGAATTGGATCGGAGGAAACATTGTAGCGACGTTCAATTCTGGGTTCCGAACATCGTGGGTAGATACGAAAAGGATGGCGTCGATATGTTAACTATGGAAATCTGGTAGAAGGGAGGGGGATGTCGCATAGAAGGGGTTGTCAGATATACAGGAGTGATTTTTTTTAATTACTGTCACAGGGGCAAGTCCATGGTCGGCGCCGCAGTGTTTCTACATGACGGGGTCTGACGCTCCGGCGAACAGCTTAGTTTAGTGTACGGGATCAGAGATAATCGCAGGGGTGACGACAGATCCACCTGAGGGTGCCCGACGTACCTTAGATCAATGAAGCTTTGACAGCCTATTGGACGGCAGCCCCTCGCCTTGAACGTAGGGCACTCGTCTCCACATCCGGGCTTTGCTGTAAATACCTTGGAGGCCTAGTATTCGGATCTAGTTGGTGAGTTGTTTGAAGGCCGGTCTTGCTCATGACAAATGGTCCTCGGATTAGCGTGAAGCACCCCCTACGATCCGGTCGGAGCTCGATTATGTGAATCAAGGGTGACAATGAAGCCGAATTTATATCTAGACAATAACTCAACGAGTATAAAGCGGATTGCAGATATTCCGCGCACATTAGCTTGCACTTGGGGGTTATCTTCAAGCTAACCCACCAAGGCGGCGACAATGCGACGAGTCTGGTCATCTCCTCCAATTGGCTAGAAATTGGAGCCGGGAGCCATCATATTCAACGGTGATACTGGAGGACAGATTCTGTTTATATATACTCGCACCCTGAGGTAGATTATCTTAGCTCTTAGGCAGATTAAGCCGACATATCAGTGCTTTTCCATGGACGGACCGCCCCGTAGCAGGACGCTCCTAATTGTAAGTGTGGCATTTTGGGCGAGTAATATGGTTGTTTAAAGTTAGACAGCGCGCTGTCTGCACGTTGCGTGTGTATACGTCCCTCACGAGCGCGTTAACGCCGGCGGTATTCAGCGCGCTGGGATATATAAAACTGCCGCTAGCTCCCGGCCGCAACTTTTATGATATGTAACGACCCCTACGTTAAGAGGAGGGTCATCCCCGCCGTGTCCGCCTGGGCTGACACCAGTTAGCTCTTGAAGGTAAATGATGTGCCGCACAAATACTGAGGAAGGCTGCTAACCTGCTATAGGTGAACGCAGATCTGTTCGCAAGGCGAACGTGCCGCCTTGGCGACGTATTCACTCCGTGACTGGCCGTACCATCTAAGTGAACTTGCCCAATGAGTCATCATGCTCGATTTTCGGTTAATGGAAGTCTCACCGTCTCGCGGGATTTTACCCATTTCCACGTTCCGTCTCTTGAGCACGGGCGACCCGAACAGACCGTGACAAAGGTCCAAGACGCCTATGGAACTATACCCAGGAATGGCATTTACATTGATCAACCGCTGTAGGTTGAGTCAGGAATCGCCAGGGAACTCTTACCCCCGATACCGCCCCATGCGCTCAGCCGTTACGGATAACCCGCCGCGCGGAAGACACCGTAAAGCGTTGACGAAGGCCTATAACGCTAAGCGGGTCCCGGTTGTAGCCAAGGCCACCACAGATGCGCGAGCCGGCATCTACTCCGACATACAAGTTCATGACCGAAACGCTAGCAGAGGGTTTAGCCCCTATGACTCGGGCCCGAACGATTTGCGAAACAGAATGACGCTATTCGCGAGTGGATAAGCCAAAAGAAAAGCTAACGGCAGGTTGTGGCCTCGCCGTGTAAGTTACCGAAGCGGGCTTAACTGCAGCATTCGCGAACCTAGGGTTTCAGGGGATGGCTGTTGTGACCTTATCACAGCGTGTTCGCATGCGGACTATCGTGCTATGCAGAGCGGTACCAGTGACCTCATATCAGGGATCGTTAGATGAAATGAGACAGCTCACACCAGGACTCATG
ACTTCGGTCTTATGGGCTGTCTGGAGGGACACGGGTGTCCCTAGAGCATTGGCAACGTAAATTAGAATGGTCGGCCCACGACCCTACAATGTCGTGAACACACGTGGTAGTTGCGCATCCGACCACCACGACGCGCAGATACGTGTCCAACTAACGTGACTCGGTCTCCCTGCCTGAGTCACCTGACCTTCCTTGATAAGGAGTTCGCCTAGGAATAACGCCTAGGCCGCAAATGGGTGTTTTTAGTGGAGCTATTGTGAAGTACGGTGCAAAACAAGCCCAAGTTCCCGGGCCGCGATTTATCGGCGAGTGTTGGAAAGCGCCGTCTAGCCCAACTTAATACACTCCCCAGACGACTTTCCTGCCGAGCCGCCGTAGACATTGCCGTTTCATTCACGGTGCCCCCTAGAAAAGGGATATACTGAAACGAAACTATCATGCTACCGAACTCTCTAGGGACCATGGTCTAGGCAGCCTGTGTCATATATTTAGATAGCCAGGGGGGATTGATTAAGATCACCCCGCAGGGATGTCCTCGCTGGCATGTCCTACTGTCTGGCCCTGATTTAAACTCCTCCGGCTTGCGCCCATAGATGTACTTGTGGCGTAGTACTGATTTACTGCGCCTCATCGTTCCCACCAAGACATCAGCGTGGTACAGGAACCTGCTCTACGAGGGATAGCAAATGAAGTAACAGAGTCTAGTAGTCAGCAGTTGATATGTGACGTTGACAGACAACTACGTATTTACGTTGCCTACGAAACATATGGAGCCTGAACAAAACCCAGAAGGAGTCACTGCTAACCCATGGGAACAAGGTCTACCCATGCCATTACACAGGCCCAGGGGGCTACTACCTGATTTGCACGACCGATCGGTGCAAGGTTACCCCTGACAGACGTTTCTGTCGGTTCATTCCTCGCGTTACTCCTGGGGTTACACCTGCAATGATCATAACTTATGAAATAGACCACTCGTAAGTATAGTTGCGTGTTCCCATACACCTCCCAATTTCGTAGTATACTGCCCGAGTTATTGGTCTCCTTAGTTTATGGCCTTTTTACCGACCTGTACGTGGCGGTTTTGTTCACCCTGAGCCCCCCCACATACGGATCGAATTTGAGCTCTAACGCCAGGAGGCACGTACGCAGCTTTCAGGGGATAGTGACCAAAGCGCGTTACCGCTGAGCACTCATAGAATGGGGATGTTAGTATAACCGTTTAAGTGGGATGACGACCGCGTCTCACCGCTGCTTCAAAGAATTCAGTGCTTCAGAGCTAGCTATCGGGAGGAAGGCTAGGTTCCAAGGCACAGGGAAGGCCCTCGGTTTGAGGTGCGGATAGAAGCGCACGCCCCAACCAGTAGGAAGGGTGTTAAAGACGATCAGACCGGATTTCTACTATTGCGTCCGGCCATCCCTTGAAGTCCTGCCCCCAAGGTGTGGACTGGAAAGGCAGATGCGCGTAAGTTCAACACTTTGACACTCAGCCACTTGTGGACAGGAGTCTGGTCGCGTGACCTTAAACTTGGCAGGCGGGGAAGTCCTACGCATTCTCCCTTGATAGAGCACGAGAGACTACATCGGTGCGTATTAGCAGCGCAAGTTGGCCCCTTATTCTAGCCTATGTTTTCCTGTATGCCGTTATCGTATGCCGGGATGATGGTTTTAAGAGGCCTGCGGTGGAGTGAAGCGTTAAATGATGGATCTTAGCTGCCTAACTCCCGGTCTAGATTGAGTGTAGCCGGGTCACAGCGGTAATCCACGCTGCCAATTTTCGTATCTTTATAGGTTGGCACTTAAGTCATGTCGGACAACTAGTTTCCCACGTTTCAAATGTACCTCTCTCAATCGCTCCGCATCCAGCCCGGGACGATAGCTGGAGGATGGGTGTGAAAGCTCAACGTCAAGTAAAAAACGGCCGACTACCTTGGTGCCGTATGTGGTGTGAAAGGAATTCCCCTTTTTTGTAGCCTTATGTACGACGTATTTGGACACCT
TCTTACAGCCTCAAGTGGATGGTTGGTGTACGCCGCCCGCTGTCGAGTGACGAAGCTTTCGAGCCATAGTAGCAAACCTGACCGAAATATAGTCCTTATTCACAGCGGCTCCATTTAATTCGCGCGCCTGGGTAGAACAGGGGTGTATTGAAGGGTTATCCGGGAGTCGGTACGTCGCTAATGTTGAATTTGGAGGCATTAGTAAGTCCACCCTTTACTGATACATATAAGGGGGTATTCCGCTCCTACAGTGAGAACCTGTGTCGTAGCGCTCACATTGGTGGCCTGTAAAACCCTGATAAGTAGCTGTTGAGGACTATTCCGCGTCCGGCAATCGCCCTGGTCATTGGAAGTGTACCCACACCAGTTCAAAACCGGCGCGAATACCTAGTGCTTTGTTGACTTCTCACTGATTTCGGTCCTTAAGACACTGACTCCCGCTCCACTCGGGGGCATTGGGCTCGCGTGTTGATAAGGTATCACCCAACGCGAGGGCGGAGTATAAGACAGTAGAGAACACAATTATCTCATTTAACGTATTGACCGCTGGTCTGCCTACAGTCTCTATACCTATGCGCATACGTGATCTGAACCGATCTTGGTCGAGACGATATAGCGGTACTAGACGTCTAAGCGATTGGCAATAGTAACATCATCCGTTACGCTTAAGGACGGCCTACGCCTGGTGTTTCGGACAAGGCGTCTCGGTGCAGTCCGTTTGACTATGGGAGCTTCGGCCTTTGACAGAACCCTGTGCTTAAAGTGAACTATCGTGGACTGGAACTTTCCCAAGATTGTGATATTCGGCTGCTGACCACCAATCAAAAGTGATAGGCTACGGGACGCAAAAGTGTCGGTGTCGCAAGTATAATGTTGAGAGCGGTTGAGCGCGATGCGTTGTATGCTTGTCGTCGATTTAGTGATCCGCCGCGGGCTCTTTTACATTATTATAGCTTGTTCTAGCATGAATATTACCTGAACTCAATTCATCATTGCATTTATGCCCAACTGCTCTATGACATGGCTACACAATGAGAGATCCGGTGAGGACAAATACGCGATCCTCGAAACCGGCATGGGCTTCGCGATGATGTAATCCGGAATTAGGCCGGTCGAAATCTCGATGAATACCAACTCAGATGGAGGCGATGACCGCTATGTGCTTATACGCTATGTCAAGCACTCTCTGACCTCGTTGTTTGGGAGCAAGAATCTCTGGCCATCTTTCCAAGTAGCGAACTTCAGGGGAAGTGGCGTCCGTATAGTACAATCGAGTGCTTGTCGGTGCTTACTATCGACACACCGCAATACTTAGCGTTCTCTGTACGCTCCCGGTCGGACCAGCTGACGATTTTGCGAGCATCCTAGAGCGAGGCGAAAGTACAATATCCCATTGTTAGAGCGAACTAGCATATCAGAATAAACTGTAGCATTCACGCACTCATTGTCTCTAAAGTGAAATATTTCCAGTCACGCGCCCCATAGCGTAGAGAAATGGGATGTCCCTCACTCGACTCTCTTATAGTTCGGACATAAATGTCCCTTCAACCGTATCGACCACCGCCCGGCGAACTGCTCAAGCCGTCCCAGTACCCTAATAAAAGACTCACAGACCCAATAACCGCACTTAACCTTAGCTGCCCTTGGTTCATTAGACGCGAAAGAAAGTTTCGCGCTTCAAAATACTCCCCTGCCGCTACTCTCTAAGCGGAACGTCCTGCAGGCTTCATTATGGGAAGGTCAACATTCATCCGATAGTTGGAACCCACTCAGGATAAGTAGTCCAGTCGCCCGTTTTTAGGAATTGCGGAAGGGGCGCCTGAGTACTTACCTCAGACCTCGACTCACGAAGTTCGGAGATAGACTCGTTCATATTTTGCGCACTCTGAGCACGTTGGGCTTTAGACCGAATGGCGAATCTGTCTTTATAATTTTAACCTCACCTAGCGAATCTAGAATTCGGGCAATAAACGTGCTCTTCATACGTAATTGGACAAGTC
The Same String in the Source String:
"AACTTCAGCG"
Length: 10
The Start Pos: 518 3753
"ACCCTGAGCCC"
Length: 11
The Start Pos: 7091
"ACGGACACCC"
Length: 10
The Start Pos: 14 3450
"ACGGATCGAA"
Length: 10
The Start Pos: 3482 7110
"ACTTCAGCGA"
Length: 10
The Start Pos: 519 3754
"AGGACAAATA"
Length: 10
The Start Pos: 1322 9047
"ATGCGGGAGA"
Length: 10
The Start Pos: 992 2796
"CAGATGCGCG"
Length: 10
The Start Pos: 5656 7457
"CAGCGCGCTGG"
Length: 11
The Start Pos: 5062
"CATTCACGCA"
Length: 10
The Start Pos: 435 9431
"CCGGCATGGG"
Length: 10
The Start Pos: 1222 9071
"CGAGTAATAT"
Length: 10
The Start Pos: 3838 4970
"CTAACGCCTGG"
Length: 11
The Start Pos: 1012 1293
"CTTAGCTGCCCT"
Length: 12
The Start Pos: 9633
"CTTCAGCGAT"
Length: 10
The Start Pos: 520 3755
"GAGCTATTGT"
Length: 10
The Start Pos: 1856 6249
"GAGGAAGGCT"
Length: 10
The Start Pos: 5218 7291
"GAGTCCCGTT"
Length: 10
The Start Pos: 2778 3334
"GAGTCTGGTC"
Length: 10
The Start Pos: 4757 7505
"GCCATAGTAG"
Length: 10
The Start Pos: 739 8063
"GGGGTTGTCA"
Length: 10
The Start Pos: 3351 4216
"GGGTTGTCAG"
Length: 10
The Start Pos: 3352 4217
"GGTGTGAAAGG"
Length: 11
The Start Pos: 7945
"GTGGACTGGAA"
Length: 11
The Start Pos: 7443 8773
"TAGCCAAGGCC"
Length: 11
The Start Pos: 5641
"TCCACATCCGG"
Length: 11
The Start Pos: 4467
"TCTAAGCGATT"
Length: 11
The Start Pos: 8629
"TCTAGATTGA"
Length: 10
The Start Pos: 941 7729
"TCTCCACATCC"
Length: 11
The Start Pos: 3692 4465
"TGTCTGCACG"
Length: 10
The Start Pos: 295 5005
"TTAATTACTG"
Length: 10
The Start Pos: 2171 4247
"TTATGATATG"
Length: 10
The Start Pos: 595 5110
"TTCACCCGTT"
Length: 10
The Start Pos: 1530 3188
"TTCAGCGATAAG"
Length: 12
The Start Pos: 3756
"TTCAGGGGAT"
Length: 10
The Start Pos: 5870 7155
"TTCTAGCCAA"
Length: 10
The Start Pos: 75 1073
"TTGTATCAGC"
Length: 10
The Start Pos: 1234 1539
"TTTCAGGGGA"
Length: 10
The Start Pos: 5869 7154
可以看到,重复串长度超过10位时出现错误,也就是说对更长的重复字符串的处理还不正确(估计是哪个pos搞错了).
说明两点:
1>输出顺序与查找到的顺序不同.
因为我用的是set,目的是防止出现重复查找的问题,但set会自动排序.也可以用普通容器,不过最后要先排序再去重(unique);我觉得两者效率差不多,可能后者更好.毕竟从10000的长度来看重复的数据没多少,所以效率不会低.
2>考虑特殊情况.
以10位长度的源字符串、重复串至少4位为例,如果源字符串为AAAAAAAAAA,
那么找到的结果将是(输出的下标我从1而非0开始):
AAAAA 1 6
AAAA 3 7
这样是否是大重复字符串里出现小重复字符串呢?我认为看你怎么看了,或者题目再给更清楚一点也行.
a)我认为不是:因为第二个重复串的开始及末尾位置(也就是3,6)并不在第一个大重复字符串的区间(也就是1,5)之内,所以不是大中包含小的.
b)可以认为是:如果你明确要求任何两个重复字符串不能有重叠,那么这里可以算是子重复字符串了,毕竟这样也合理.不过这样倒是可以更容易也更高效地处理,因为pos的跨度更大;而前面处理的情况更多些.
希望晚上再找出那个BUG.
我用记事本验证了一下,能找到的基本上是正确的(除了那个有BUG的更长的串),就不知道有没有重复串没找到的情况. |
|