POJ 2778 DNA Sequence

Description

It’s well known that DNA Sequence is a sequence only contains $\text A$ , $\text C$ , $\text T$ and $\text G$ , and it’s very useful to analyze a segment of DNA Sequence. For example, if a animal’s DNA sequence contains segment $\text{ATC}$ then it may mean that the animal may have a genetic disease. Until now scientists have found several those segments, the problem is how many kinds of DNA sequences of a species don’t contain those segments.
Suppose that DNA sequences of a species is a sequence that consist of $\text A$ , $\text C$ , $\text T$ and $\text G$ , and the length of sequences is a given integer $n$ .

Input

First line contains two integer $m$ ( $0 \le m \le 10$ ), $n$ ( $1 \le n \le 2000000000$ ). Here, $m$ is the number of genetic disease segment, and n is the length of sequences.
Next $m$ lines each line contain a DNA genetic disease segment, and length of these segments is not larger than $10$ .

Output

An integer, the number of DNA sequences, $\bmod 100000$ .

Sample Input

4 3
AT
AC
AG
AA

Sample Output

Idea

设字典树可接受字符集为小写英文字母。将 $m$ 个病毒串插入字典树，并为病毒串末尾打上标记。我们的目标是，从树根出发，走 $n$ 层构造出一个字符串 $s$ ，且 $s$ 的任意子串都不是病毒串。
上述方法存在两个问题。第一，我们仅仅在字典树上插入了病毒串，若要使得构造的字符串 $s$ 能够使用字符集中的所有字母，需要存一棵 $26$ 叉树，这显然是不可能的。第二，如果我们仅仅在病毒串的结尾打标记，会对病毒串的检索遗漏，比如插入两个字符串 $\text{ACG}$ 和 $\text C$ ，我们在两个字符串的结尾 $\text G$ 和 $\text{C}$ 打上标记；然而，若游走到了树上表示 $\text{AC}$ 的位置，实际上已经包含病毒串 $\text C$ ，但是我们并没有在这个位置打上标记。
POJ 2778.png
对于上述两个问题，在字典树的基础上构建AC自动机就可以解决。在构建AC自动机时，对于指向空的状态转移 $trie(u,i)$ ，不再为其分配结点，而是将其指向 $trie(fail_u,i)$ ，达到了路径压缩的目的，由于 $fail_u$ 指针指向了 $u$ 的后缀，我们可以理解为省略了 $u$ 的前缀而继续向下寻找；同时 $fail$ 指针具有良好的性质， $fail_u$ 指向的字符串是 $u$ 表示的字符串的最长可匹配后缀，因此若 $fail_u$ 被打上标记，则 $u$ 也应当打上标记。
图的邻接矩阵 $A$ ， $A^n_{ij}$ 代表在图上从 $i$ 出发走 $n$ 步到达 $j$ 的路径数量。我们可构造AC自动机上的邻接矩阵：若 $trie(i,j)$ 没有被打上标记，则 $A_{i,trie(i,j)}=0$ ，否则 $A_{i,trie(i,j)}$ 增加 $1$ 。在此说明为什么是 $A_{i,trie(i,j)}$ 增加 $1$ ，而不是令 $A_{i,trie(i,j)}=1$ ：因为AC自动机在构建 $fail$ 指针的过程中顺带完成了路径压缩的工作，会有多条边指向 $trie(i,j)$ 。
在AC自动机上游走 $n$ 步， $\sum\limits_{i=0}^{tot}A_{0,i}^n$ 即为答案， $tot$ 为AC自动机上最大点的编号，树根编号为 $0$ 。

Code

/******************************************************************
Copyright: 11D_Beyonder All Rights Reserved
Author: 11D_Beyonder
Problem ID: POJ 2778
Date: 2021 Mar. 31st
Description: AC Automaton, Quick Matrix Pow, Graph Theory
*******************************************************************/
#include<iostream>
#include<cstdio>
#include<queue>
#include<map>
#include<cstring>
using namespace std;
typedef long long ll;
const int CHARACTER_SET_SIZE = 4;
const int MOD = 100000;
const int MAX_M = 12;
const int MAX_LEN = 11;
int m, n;
int trie[MAX_M * MAX_LEN][CHARACTER_SET_SIZE];
int tot;
bool permitted[MAX_M * MAX_LEN];
int fail[MAX_M * MAX_LEN];
char str[MAX_LEN];
map<char, short>_;
struct Matrix {
	ll m[MAX_M * MAX_LEN][MAX_M * MAX_LEN];
	Matrix() {
		memset(m, 0, sizeof(m));
	}
};
Matrix operator*(const Matrix& x, const Matrix& y);
Matrix quickPow(Matrix a, ll b);
Matrix initialMatrix();
inline void init();
void insert(char s[]);
void buildACAutomaton();
int main() {
	cin >> m >> n;
	init();
	for (int i = 1; i <= m; i++) {
		scanf("%s", str);
		insert(str);
	}
	buildACAutomaton();
	Matrix final = quickPow(initialMatrix(), n);
	ll ans = 0;
	for (int i = 0; i <= tot; i++) {
		ans = (ans + final.m[0][i]) % MOD;
	}
	cout << ans << endl;
	return 0;
}
inline void init() {
	//字母表 方便映射
	_['A'] = 0;
	_['C'] = 1;
	_['T'] = 2;
	_['G'] = 3;
	memset(permitted, true, sizeof(permitted));
}
void insert(char s[]) {
	int len = strlen(s), p = 0;
	for (int i = 0; i < len; i++) {
		int ch = _[s[i]];
		if (!trie[p][ch]) {
			trie[p][ch] = ++tot;
		}
		p = trie[p][ch];
	}
	//a genetic disease isn't permitted
	permitted[p] = false;
}
void buildACAutomaton() {
	queue<int>q;
	for (int i = 0; i < CHARACTER_SET_SIZE; i++) {
		if (trie[0][i]) {
			q.push(trie[0][i]);
		}
	}
	while (!q.empty()) {
		int u = q.front();
		q.pop();
		//---------------------------------------------
		//If the suffix of a string is not allowed
		//then the string is not allowed
		//---------------------------------------------
		if (!permitted[fail[u]]) {
			permitted[u] = false;
		}
		for (int i = 0; i < CHARACTER_SET_SIZE; i++) {
			if (trie[u][i]) {
				fail[trie[u][i]] = trie[fail[u]][i];
				q.push(trie[u][i]);
			} else {
				trie[u][i] = trie[fail[u]][i];
			}
		}
	}
}
Matrix operator*(const Matrix& x, const Matrix& y) {
	Matrix res;
	for (int i = 0; i <= tot; i++) {
		for (int j = 0; j <= tot; j++) {
			for (int k = 0; k <= tot; k++) {
				res.m[i][j] += x.m[i][k] * y.m[k][j];
				res.m[i][j] %= MOD;
			}
		}
	}
	return res;
}
Matrix quickPow(Matrix a, ll b) {
	Matrix res;
	for (int i = 0; i <= tot; i++) {
		res.m[i][i] = 1;
	}
	while (b) {
		if (b & 1) {
			res = res * a;
		}
		b >>= 1;
		a = a * a;
	}
	return res;
}
Matrix initialMatrix() {
	Matrix res;
	for (int i = 0; i <= tot; i++) {
		//the node i isn't permitted to reach
		if (!permitted[i]) {
			continue;
		}
		for (int j = 0; j < CHARACTER_SET_SIZE; j++) {
			//one of the sons of node i isn't permitted to reach
			if (!permitted[trie[i][j]]) {
				continue;
			} else {
				res.m[i][trie[i][j]]++;
			}
		}
	}
	return res;
}