字符串：KMP

KMP是字符串匹配的经典算法

也是众多字符串基础的重中之重

题意：给T组数据，每组有长度为n和m的母串和模式串。判断模式串是否是母串的子串，如果是输出最先匹配完成的位置，否则输出-1.

做法：直接套用模板。把char改成int。kmp函数中在模式串遍历到结尾的时候return，若没遍历到结尾，也就是不是子串返回-1

 [cpp] view plain copy

 #include <iostream>

 #include  <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1000005 is an
 // assumed upper bound for both sequence lengths - confirm against the problem limits.
 int nexta[1000005],a[1000005],s[1000005];
 int n,m; // n: length of the text, m: length of the pattern

 // Builds the KMP failure table of the pattern s[0..m) into nexta[0..m].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta(int s[])
 {
     // Every entry nexta[0..m] is assigned below, so the table is not cleared first.
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < m) // the table describes the pattern, so the bound is m, not the text length n
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k]; // fall back to the next shorter border
         }
     }
 }

 // Searches the pattern t[0..m) inside the text s[0..n).
 // Returns the 1-based index in s at which the first occurrence starts,
 // or -1 when the pattern does not occur.
 int kmp(int s[],int t[])
 {
     getnexta(t);
     int i = 0,j = 0;
     while(i < n && j < m)
     {
         if(j == -1 || s[i] == t[j])
         {
             i ++;
             j ++;
         }
         else
         {
             j = nexta[j];
         }
         if(j == m) // whole pattern matched, ending just before text index i
         {
             return i - j + 1;
         }
     }
     return -1;
 }

 // Reads T test cases. Each case gives n and m, then the n text values into a[]
 // and the m pattern values into s[], and prints the result of kmp(a,s).
 int main()
 {
    // freopen("in.txt","r",stdin);
     int T;
     scanf("%d",&T);
     while(T--)
     {
         scanf("%d%d",&n,&m);
         for(int i = 0;i < n; i ++)
         {
             scanf("%d",&a[i]);
         }
         for(int j = 0; j < m;j ++)
         {
             scanf("%d",&s[j]);
         }
         printf("%d\n",kmp(a,s));
     }
     return 0;
 }

题意：给T组数据，每组有两个字符串按顺序分别为模式串和母串。判断模式串在母串中出现的次数。模式串在母串中是可以相互覆盖的。

做法：直接套用模板。在kmp中当j==m也就是模式串完全匹配时，ans++，且j = nexta[j]

 [cpp] view plain copy

 #include <iostream>

 #include  <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1000005 is an
 // assumed upper bound for both strings - confirm against the problem limits.
 int nexta[1000005];
 char t[1000005],s[1000005];

 // Builds the KMP failure table of the NUL-terminated string s into nexta[0..strlen(s)].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta(char s[])
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int n = strlen(s);
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Counts the occurrences of pattern t in text s; occurrences may overlap.
 // Returns the number of occurrences (0 when there are none).
 int kmp(char s[],char t[])
 {
     getnexta(t);
     int ans = 0;
     int n = strlen(s),m = strlen(t);
     int i = 0,j = 0;
     while(i < n && j < m)
     {
         if(j == -1 || s[i] == t[j])
         {
             i ++;
             j ++;
         }
         else
         {
             j = nexta[j];
         }
         if(j == m) // full match: count it and continue from the longest border so overlaps are found
         {
             ans ++;
             j = nexta[j];
         }
     }
     return ans;
 }

 // Reads T test cases. Each case gives the pattern t followed by the text s
 // and prints the value returned by kmp(s,t).
 int main()
 {
    // freopen("in.txt","r",stdin);
     int T;
     scanf("%d",&T);
     while(T--)
     {
         scanf("%s%s",t,s);
         printf("%d\n",kmp(s,t));
     }
     return 0;
 }

题意：输入母串和模式串，以’#‘为结束。剪纸花，从母串中减去模式串问能剪出多少。这就意味着求解模式串的数量时不能重叠覆盖

做法：模板。在kmp中当j==m也就是模式串完全匹配时，ans++，且j = 0.要再次从头开始匹配

 [cpp] view plain copy

 #include <iostream>

 #include  <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1005 is an
 // assumed upper bound for both strings - confirm against the problem limits.
 int nexta[1005];
 char t[1005],s[1005];

 // Builds the KMP failure table of the NUL-terminated string s into nexta[0..strlen(s)].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta(char s[])
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int n = strlen(s);
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Counts the non-overlapping occurrences of pattern t in text s, scanning left to right.
 // Returns the number of occurrences (0 when there are none).
 int kmp(char s[],char t[])
 {
     getnexta(t);
     int ans = 0;
     int n = strlen(s),m = strlen(t);
     int i = 0,j = 0;
     while(i < n && j < m)
     {
         if(j == -1 || s[i] == t[j])
         {
             i ++;
             j ++;
         }
         else
         {
             j = nexta[j];
         }
         if(j == m) // full match: count it and restart from the pattern start so matches cannot overlap
         {
             ans ++;
             j = 0;
         }
     }
     return ans;
 }

 // Reads pairs "text pattern" until the text is "#" (or input ends) and prints
 // the value returned by kmp(s,t) for each pair.
 int main()
 {
    // freopen("in.txt","r",stdin);
     while(scanf("%s",s) == 1) // also stop on EOF instead of looping forever
     {
         if(strcmp(s,"#") == 0)
             break;
         scanf("%s",t);
         printf("%d\n",kmp(s,t));
     }
     return 0;
 }

题意：给T组数据，每组有一个字符串，只能在字符串的前面和后面增加字符，不能在中间增加，求要使这个字符串是周期循环的且周期的次数大于一，至少需要增加的字符数量。注意这个字符串是个手链，也就是说增加字符后首尾相连是周期的即可

做法：首先求最小循环节。考虑一种特殊情况：nexta[n] = 0，这个时候没有任何前缀能与后缀匹配，所以需要增加n个字符。最小循环节为temp = n - nexta[n]。当n能被temp整除时，字符串已经是周期循环的，不需要增加字符。当不能整除时，用最小循环节的长度减去末尾已经存在的那部分字符数目，即ans = temp - (n % temp)

 [cpp] view plain copy

 #include <iostream>

 #include  <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 100005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 int nexta[100005];
 char s[100005];

 // Builds the KMP failure table of the NUL-terminated string s into nexta[0..strlen(s)].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta(char s[])
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int n = strlen(s);
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads T strings. For each one prints how many characters must be added
 // so that its length becomes a multiple (at least 2x) of its smallest period.
 int main()
 {
    // freopen("in.txt","r",stdin);
    int T,ans,n,temp;
    scanf("%d",&T);
     while(T --)
     {
         scanf("%s",s);
         n = strlen(s);
         getnexta(s);
         temp = n - nexta[n]; // length of the smallest period
         if(temp == n)
         {
             ans = n; // no border at all: the whole string must be repeated once
         }
         else if(n % temp == 0)
         {
             ans = 0; // already made of whole periods
         }
         else
         {
             ans = temp - (n % temp); // complete the last, partial period
         }
         printf("%d\n",ans);
     }
     return 0;
 }

题意：给字符串的长度和一个字符串。读到eof。求每个字符串中在i之前的位置是循环的且次数大于1，求这个位置i以及循环的次数

做法：求每个位置i的最小循环节，判断是否整除和个数大于1，并用除法的值求次数

 [cpp] view plain copy

 #include <iostream>

 #include <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1000005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 int nexta[1000005];
 char s[1000005];
 int n; // length of s, set by the caller before getnexta()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads cases "n, string" until n is 0 (or input ends). For each case prints a
 // "Test case #k" header, then one line "i count" for every prefix length i that
 // consists of count > 1 repetitions of its smallest period, then a blank line.
 int main()
 {
    // freopen("in.txt","r",stdin);
         int t = 0,temp;
         while(scanf("%d",&n) == 1 && n != 0) // also stop on EOF instead of looping forever
         {
             t ++;
             scanf("%s",s);
             printf("Test case #%d\n",t);
             getnexta();
             for(int i = 2; i <= n; i ++) // a prefix of length 1 cannot repeat more than once
             {
                 if(nexta[i] == 0)
                 {
                     continue; // no border, so the prefix is not periodic
                 }
                 temp = i - nexta[i]; // length of the smallest period of the prefix s[0..i)
                 if(i % temp == 0 && i / temp > 1) // nexta[i] describes the prefix ending at index i-1
                     printf("%d %d\n",i,i / temp);
             }
             printf("\n");
         }
         return 0;
 }

 /*

 a  a  b  a  a  b  a  a  b  a  a  b

 -1 0  1  0  1  2  3  4  5  6  7  8

 0  1  2  3  4  5  6  7  8  9  10 11

 */

题意：每组给一个字符串，一直读到eof。这个字符串是重复写的AAAAAAA的一部分，求这个A最短是多少

做法：。。。。。。此题有问题，全网代码没有对的，我至少交了8+份代码

题意：每组给一个字符串，一直读到'.'。字符串s = a^n，即s由n个a连接而成，求n的最大值

做法：求最小循环节n - nexta[n]。如果n能被它整除，那么除得的商即为ans；如果不能整除，则ans = 1

 [cpp] view plain copy

 #include <iostream>

 #include <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1000005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 int nexta[1000005];
 char s[1000005];
 int n; // length of s, set by the caller before getnexta()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads strings until "." (or input ends). For each string prints the largest k
 // such that the string is k copies of one block, or 1 when it is not periodic.
 int main()
 {
     //freopen("in.txt","r",stdin);
     int ans;
     while(scanf("%s",s) == 1) // also stop on EOF instead of looping forever
     {
         if(strcmp(s,".") == 0)
             break;
         n = strlen(s);
         getnexta();
         int period = n - nexta[n]; // smallest period; never 0 because nexta[n] < n
         if(n % period == 0)
             ans = n / period;
         else
             ans = 1;
         printf("%d\n",ans);
     }
     return 0;
 }

 //ababa

题意：每组一个字符串，读到eof结束。寻找i使得字符串的前缀等于后缀

做法：首先n（字符串的长度）肯定是答案之一，因为此时前缀和后缀是同一个串。然后从nexta[n]开始不断迭代，即i = nexta[i]，当i == 0时结束。因为i是由nexta找到的，所以长度为i的前缀等于以n为结束的字符串的后缀。可以看看kmp算法讲解中的图，体会一下

 [cpp] view plain copy

 #include <iostream>

 #include <cstdio>

 #include <cstring>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 400005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 int nexta[400005];
 char s[400005];
 int ans[400005]; // border lengths collected by main, longest first
 int n; // length of s, set by the caller before getnexta()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads strings until input ends. For each string prints, in increasing order
 // and separated by spaces, every length L for which the prefix of length L
 // equals the suffix of length L (the full length n is always included).
 int main()
 {
     //freopen("in.txt","r",stdin);
     int temp,k;
     while(scanf("%s",s) == 1)
     {
         // NOTE(review): this "." terminator was present in the original listing;
         // confirm that the input format really uses it.
         if(strcmp(s,".") == 0)
             break;
         n = strlen(s);
         getnexta();
         // Walk the failure chain n -> nexta[n] -> ... ; each value is a border length.
         k = 0;
         for(temp = n; temp > 0; temp = nexta[temp])
         {
             ans[k] = temp;
             k ++;
         }
         // The chain was collected longest first, so print it backwards.
         for(int i = k - 1; i > 0; i --)
             printf("%d ",ans[i]);
         printf("%d\n",ans[0]);
     }
     return 0;
 }

 //ababa

题意：T组数据，每组m个DNA序列，每个DNA序列都有60个字符，且只由ACGT几个字母构成。判断m个DNA序列最长公共的子串是什么？如果有相同长度的公共子串，则输出字典序最小的。如果小于3输出“no ……”，大于等于3输出字符串

做法：对第一个DNA序列取从i开始到结尾的子串。与其他DNA序列进行匹配。因为是从前向后匹配，在kmp时做出改变，求出每个DNA序列和子串匹配的最长长度，再对所有最长长度取最短的那个。注意对长度相等时的处理。其中还用到strncpy,可以复制一定长度的字符串到指定字符串中。注意在最后加'\0'

 [cpp] view plain copy

 //直接枚举第一串的所有后缀，然后与后面的所有串进行比较，判断有几个字母是相同的即可

 #include <iostream>

 #include <cstdio>

 #include <cstring>

 #include <algorithm>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 65 leaves room for
 // a 60-character sequence plus the terminator; 15 rows is an assumed upper bound
 // on the number of sequences - confirm against the problem limits.
 int nexta[65];
 char c[15][65]; // the m input sequences
 char s[65];     // current candidate: a suffix of c[0]
 int n,m,l;      // n: sequence length (60), m: number of sequences, l: length of s

 // Builds the KMP failure table of the global string s into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 // The loop runs to n even when s is shorter; entries 0..l are still correct
 // because nexta[i] depends only on s[0..i).
 void getnexta()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // For every sequence c[i], finds the longest prefix of s[0..l) that occurs
 // somewhere in c[i][0..60). Returns the minimum of those lengths over all m
 // sequences, i.e. the longest prefix of s shared by all of them.
 // getnexta() must have been called for the current s.
 int kmp()
 {
     int maxx = 60; // no common prefix can be longer than a whole sequence
     for(int i = 0;i < m; i ++)
     {
         int temp = 0,j = 0,k = 0;
         while(j < l && k < 60)
         {
             if(j == -1 || c[i][k] == s[j])
             {
                 j ++;
                 k ++;
             }
             else
                 j = nexta[j];
             if(j > temp) // longest prefix of s matched inside this sequence so far
             {
                 temp = j;
             }
         }
         if(temp < maxx) // keep the length that every sequence can match
             maxx = temp;
     }
     return maxx;
 }

 // Reads T data sets of m DNA sequences (60 characters each). For each set prints
 // the longest substring common to all sequences, preferring the lexicographically
 // smallest among equally long ones, or "no significant commonalities" when that
 // length is below 3.
 int main()
 {
     //freopen("in.txt","r",stdin);
     int T,temp,num;
     n = 60; // every sequence has exactly 60 characters
     scanf("%d",&T);
     while(T--)
     {
         char result[65]; // best common substring found so far
         char t[65];      // candidate with the same length as the current best
         result[0] = '\0';
         num = 0;         // length of the best common substring so far
         scanf("%d",&m);
         for(int i = 0; i < m; i ++)
         {
             scanf("%s",c[i]);
         }
         // Try every suffix of the first sequence as the candidate string.
         for(int i = 0; i < 60; i ++)
         {
             l = 60 - i;
             strcpy(s,c[0] + i);
             getnexta();
             temp = kmp();
             if(num == temp)
             {
                 // Same length as the best: keep the lexicographically smaller one.
                 // strncpy does not terminate the copy, so do that before comparing.
                 strncpy(t,c[0] + i,temp);
                 t[temp] = '\0';
                 if(strcmp(t,result) < 0)
                 {
                     strcpy(result,t);
                 }
             }
             else if(num < temp)
             {
                 strncpy(result,c[0] + i,temp);
                 result[temp] = '\0';
                 num = temp;
             }
         }
         if(num >= 3)
         {
             printf("%s\n",result);
         }
         else
             printf("no significant commonalities\n");
     }
     return 0;
 }

 //ababa

题意：每组给两个字符串以eof结尾。求s1的前缀，等于s2后缀的长度.如果长度不是零空格之后输出相同的部分，否则只输出零即可

做法：把s2接在s1后面，求nexta[n].注意求得的ans长度不能大于s1或s2

 [cpp] view plain copy

 /*考虑abcabcabcabc

 abcabcabcabcabc这组数据也就是考虑当得数大于s1或s2时

 */

 #include <iostream>

 #include <cstdio>

 #include <cstring>

 #include <string>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. The sizes assume
 // each input string has at most 50000 characters; s must hold both strings
 // concatenated - confirm against the problem limits.
 int nexta[100005];
 char s[100005];
 char a[50005];
 int n; // length of s, set by the caller before getnexta()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int k = -1,j = 0;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads pairs of strings s1 s2 until input ends. For each pair prints the longest
 // prefix of s1 that is also a suffix of s2, followed by a space and its length,
 // or just "0" when there is none.
 int main()
 {
     // freopen("in.txt","r",stdin);
     while(scanf("%s%s",s,a) == 2)
     {
         int len1 = strlen(s); // length of s1 before s2 is appended
         int m = strlen(a);    // length of s2
         strcat(s,a);
         n = len1 + m;
         getnexta();
         // A border of s1+s2 is a prefix of s1 and a suffix of s2 only if it fits
         // inside both strings. Clamping the length is not enough (the clamped
         // length need not be a border), so follow the failure chain, which
         // enumerates every border from longest to shortest.
         int limit = (len1 < m) ? len1 : m;
         int ans = nexta[n];
         while(ans > limit)
             ans = nexta[ans];
         if(ans != 0)
         {
             for(int i = 0; i < ans; i ++)
                 printf("%c",s[i]);
             printf(" %d\n",ans);
         }
         else
             printf("0\n");
     }
     return 0;
 }

题意：给T组数据，每组数据给一个长度为n的字符串s。求字符串每个前缀出现的次数，结果mod 10007

做法：dp[i]表示的是长度为i的字符串的前缀的个数。dp[i] = dp[nexta[i]] + 1。以i-1为结尾的后缀的个数是next[i],也是前缀的长度。这个前缀的长度中字符串本身的前缀出现的次数。因为以i - 1为后缀的字符串中都又出现了一次

ababa

dp[1] = 1 a

dp[2] = 1 ab

dp[3] = 2 aba a

dp[4] = 2 abab ab

dp[5] = 3 ababa aba a

 [cpp] view plain copy

 #include <iostream>

 #include <cstdio>

 #include <algorithm>

 #include <list>

 #include <map>

 #include <stack>

 #include <vector>

 #include <cstring>

 #include <sstream>

 #include <string>  

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 200005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 char s[200005];
 int nexta[200005];
 int dp[200005]; // dp[i]: number of prefixes of s that end at position i-1
 int n; // length of s, set by the caller before getnext()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnext()
 {
     int j = 0,k = -1;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[j] == s[k])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads T cases "n, string". For each case prints the total number of
 // occurrences of all prefixes of the string, modulo 10007.
 int main()
 {
     //freopen("in.txt","r",stdin);
     int T,ans;
     scanf("%d",&T);
     while(T--)
     {
         scanf("%d",&n);
         scanf("%s",s);
         getnext();
         dp[0] = 0; // the empty prefix contributes nothing; dp[1..n] are assigned below
         ans = 0;
         for(int i = 1;i <= n; i ++)
         {
             // Prefixes ending at i-1: the prefix of length i itself, plus every
             // prefix that ends at the end of its longest border.
             dp[i] = (dp[nexta[i]] + 1) % 10007;
             ans = (ans + dp[i]) % 10007; // reduce each step so the sum cannot overflow
         }
         printf("%d\n",ans);
     }
     return 0;
 }

题意：给T组数据，每组数据第一行是26个字母表示[a,z]所对应的密文字母。第二行的字符串由两部分组成，第一部分是密文部分，第二部分是明文部分。明文部分可能是不完整的，也可能是完整的输出完整的明文部分

做法：先输出第二行的全部字符串。然后对整个字符串进行变化，把密文部分转化为明文部分。原串密文部分的长度一定大于等于明文部分。明文部分最长就是从整个字符串的一半开始的。将转化后的字符串与未转化之前的字符串的后半部分进行匹配。匹配到的返回结果就是原字符串中明文的个数：temp。则密文的个数为n - temp.实际应该的长度为2 * n - 2 * temp.应该输出的长度为：2 * n - 2 * temp - n.从转换后的字符串的temp位开始直接输出即可。从该位置开始到字符串的长度减去该位置都是待输出的不完整的字符串。其实质就是用完整的明文串，与题目中给出的不一定完整的明文串进行匹配。注意其中完整的是模式串，不完整的是所谓的母串。模式串是从头开始匹配的。而母串不是

提一下如何将密文转化为明文，将第一行该位置对应的字母转化为该位置i转化为字母（i + 'a')

ps：这也就意味着，不要从头开始匹配，是母串。而普通kmp的模式串一定要是从头开始匹配的。

abcdefghijklmnopqrstuvwxyz

abcdab

 [cpp] view plain copy

 //密文和明文前后两端是重复的

 #include <iostream>

 #include <cstdio>

 #include <algorithm>

 #include <list>

 #include <map>

 #include <stack>

 #include <vector>

 #include <cstring>

 #include <sstream>

 #include <string>  

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 100005 is an
 // assumed upper bound for the message length; c holds the 26-letter table
 // plus the terminator - confirm against the problem limits.
 std::map<char,char>mapp; // cipher letter -> plain letter
 char s[100005];  // the message, converted in place to plain text by main
 char s2[100005]; // second half of the original (unconverted) message
 int nexta[100005];
 char c[30];      // conversion table: c[i] is the cipher letter for 'a' + i
 int n; // length of s, set by the caller before getnext()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnext()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     int j = 0,k = -1;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[j] == s[k])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Matches the pattern s (from its start) against the text s2 and returns the
 // length of the longest prefix of s that is a suffix of s2, i.e. how many
 // pattern characters are matched when the end of s2 is reached. Returns 0
 // when s2 is empty or the pattern is exhausted first.
 int kmp()
 {
     int la = strlen(s2),lb = strlen(s);
     getnext();
     int i = 0, j = 0;
     while(i < la && j < lb)
     {
         if(j == -1 || s2[i] == s[j])
         {
             i ++;
             j ++;
             if(i == la) // text consumed: j characters of the pattern are matched
                 return j;
         }
         else
         {
             j = nexta[j];
         }
     }
     return 0;
 }

 // Reads T cases: a 26-letter conversion table c and a message s made of the
 // complete cipher text followed by a possibly truncated copy of the plain text.
 // Prints the message followed by the missing part of the plain text.
 int main()
 {
    //freopen("in.txt","r",stdin);
     int T;
     scanf("%d",&T);
     while(T--)
     {
         scanf("%s%s",c,s);
         printf("%s",s);
         n = strlen(s);
         // The cipher part is at least as long as the plain part, so the plain
         // part can start no earlier than the middle of the message.
         int m = (n + 1) / 2;
         strcpy(s2,s + m);
         for(int i = 0; i < 26; i ++)
         {
             mapp[c[i]] = 'a' + i;
         }
         for(int i = 0; i < n; i ++)
         {
             s[i] = mapp[s[i]]; // convert the whole message to plain text
         }
         int temp = kmp(); // number of plain-text characters already present
        // cout<<temp<<endl;
         for(int i = temp; i < n - temp; i ++) // n - temp is the length of the cipher text
         {
             printf("%c",s[i]);
         }
         printf("\n");
     }
     return 0;
 }

题意：每组n个字符串，以eof结束。求每个字符串中满足S[i]=S[i+P] for i in [0..SIZE(S)-p-1],的位置。

做法：其实还是前缀与后缀相等，其中p是后缀开始的地方。递归nexta即可。因为nexta递归得到的以i为结束后缀等于前缀，等于整个长度的后缀.注意p的大小是n - nexta[n](nexta[n]是后缀的长度，n - nexta[n]就是后缀开始的位置了）

 #include <iostream>

 #include <cstring>

 #include <cstdio>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1000005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 int nexta[1000005];
 char s[1000005];
 int ans[1000005]; // periods collected by main, smallest first
 int n; // length of s, set by the caller before getnexta()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta()
 {
     int j = 0,k = -1;
     nexta[0] = -1;
     while(j < n)
     {
         if(k == -1 || s[j] == s[k])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
         {
             k = nexta[k];
         }
     }
 }

 // Reads T strings. For each one prints "Case #t: count" and then, in increasing
 // order, every p with s[i] == s[i+p] for all valid i (p == n is always included).
 int main()
 {
   //  freopen("in.txt","r",stdin);
     int T;
     scanf("%d",&T);
     for(int t = 1; t <= T; t++)
     {
         scanf("%s",s);
         n = strlen(s);
         getnexta();
         int a = n;
         int num = 0;
         // Each border length b on the failure chain gives a period n - b;
         // the chain ends with b == 0, which gives the trivial period n.
         while(nexta[a] != -1)
         {
             ans[num] = n - nexta[a];
             num ++;
             a = nexta[a];
         }
         printf("Case #%d: %d\n",t,num);
         for(int i = 0; i < num - 1; i ++)
         {
             printf("%d ",ans[i]);
         }
         printf("%d\n",ans[num - 1]);
     }
     return 0;
 }

题意：n个字符串，每个字符串都可以写成EAEBE的形式，其中E可以为空，寻找最长的E

做法：首先我们很容易知道E最长长度为nexta[n]，然后判断中间的那个E是否存在，设E的长度为i。从开头位置加1开始遍历（i+1)，到<（n - i)为止，如果有next[j] == i那么可以判断这个长度的E存在。从E可能的最大长度开始遍历

 [cpp] view plain copy

 #include<iostream>

 #include<cstring>

 #include<stdio.h>

 using namespace std;

 // NOTE(review): the array bounds were lost from this listing. 1000005 is an
 // assumed upper bound for the string length - confirm against the problem limits.
 char s[1000005];
 char t[1000005]; // not used by the code in this listing
 int nexta[1000005];
 int n; // length of s, set by the caller before getnexta()

 // Builds the KMP failure table of the global string s[0..n) into nexta[0..n].
 // nexta[0] is -1; for i >= 1, nexta[i] is the length of the longest proper
 // prefix of s[0..i) that is also a suffix of it.
 void getnexta()
 {
     // Every entry nexta[0..n] is assigned below, so the table is not cleared first.
     nexta[0] = -1;
     int k = -1,j = 0;
     while(j < n)
     {
         if(k == -1 || s[k] == s[j])
         {
             nexta[j + 1] = k + 1;
             j ++;
             k ++;
         }
         else
             k = nexta[k];
     }
 }

 // Reads T strings. Each string is viewed as E A E B E (A and B arbitrary, E may
 // be empty); prints the length of the longest possible E.
 int main()
 {
   //  freopen("in.txt","r",stdin);
     int T;
     scanf("%d",&T);
     while(T--)
     {
         scanf("%s",s);
         n = strlen(s);
         getnexta();
         int ans = 0;
         // E must be both a prefix and a suffix of s, so its length lies on the
         // failure chain nexta[n], nexta[nexta[n]], ... (longest first).
         for(int len = nexta[n]; len > 0 && ans == 0; len = nexta[len])
         {
             if(3 * len > n)
                 continue; // three non-overlapping copies would not fit
             // Look for a third copy of E = s[0..len) that lies entirely inside
             // s[len..n-len). nexta is also the failure table of every prefix
             // of s, so it can drive this search directly. One linear scan is
             // done per candidate length.
             int j = 0;
             for(int i = len; i < n - len; i ++)
             {
                 while(j != -1 && s[i] != s[j])
                     j = nexta[j];
                 j ++;
                 if(j == len)
                 {
                     ans = len;
                     break;
                 }
             }
         }
         printf("%d\n",ans);
     }
     return 0;
 }

使用高级算法会减少思维难度，相比较来说，高级算法虽然难度大，但是实用性更强

巴特西

字符串：KMP

最新文章

热门文章