課本 page. 406
15-5 Edit distance
Inorder to transform one source string of text x [1 ¬ m] to a target string y [1¬ n], we can perform various transformation operations. Our goal is, given xand y, to produce a series of transformations that change x to y. We use anarray z—assumed to be large enough to hold all the characters it will need—tohold the intermediate results. Initially, z is empty, and at termination, weshould have z[j] = y[j] for j = 1, 2, ..., n. We maintain current indices iinto x and j into z, and the operations are allowed to alter z and theseindices. Initially, i = j = 1. We are required to examine every character in xduring the transformation, which means that at the end of the sequence oftransformation operations, we must have i = m + 1. There are six transformationoperations:
• Copy a character from x to z by setting z[j] ← x[i] and then incrementingboth i and j. This operation examines x[i].
• Replace a character from x by another character c, by setting z[j] ← c, andthen incrementing both i and j. This operation examines x[i].
• Delete a character from x by incrementing i but leaving j alone. Thisoperation examines x[i].
• Insert the character c into z by setting z[j] ← c and then incrementing j,but leaving i alone. This operation examines no characters of x.
• Twiddle (i.e., exchange) the next two characters by copying them from x to zbut in the opposite order; we do so by setting z[j] ← x[i + 1] and z[j + 1] ←x[i] and then setting i ← i + 2 and j ← j + 2. This operation examines x[i] andx [i + 1].
• Kill the remainder of x by setting i ← m + 1. This operation examines allcharacters in x that have not yet been examined. If this operation isperformed, it must be the final operation.
效率 O(n*m), 挺差的
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
using namespace std;
int dp[1000][1000], lx, ly;
int arg_dp[1000][1000];
char x[1000], y[1000], buf[1000];
void printState(int ti, int tj) {
printf("%-15s", buf);
static int i;
for(i = 0; i < lx; i++) {
if(i == ti) putchar('[');
printf("%c", x[i]);
if(i == ti) putchar(']');
}
if(i == ti) printf("[]");
for(; i < 20; i++) putchar(' ');
for(i = 0; i < tj; i++)
printf("%c", y[i]);
printf("[]");
puts("");
}
void print(int i, int j) {
if(i == 0 && j == 0) {
sprintf(buf, "initial");
printState(i, j);
return;
}
if(arg_dp[i][j] == -1) {
print(i-1, j-1);
sprintf(buf, "copy");
} else if(arg_dp[i][j] == -2) {
print(i-1, j-1);
sprintf(buf, "replace by %c", y[j-1]);
} else if(arg_dp[i][j] == -3) {
print(i-1, j);
sprintf(buf, "delete");
} else if(arg_dp[i][j] == -4) {
print(i, j-1);
sprintf(buf, "insert %c", y[j-1]);
} else if(arg_dp[i][j] == -5) {
print(i-2, j-2);
sprintf(buf, "twiddle");
} else {
print(arg_dp[i][j], j);
sprintf(buf, "kill");
}
printState(i, j);
}/*
algorithm altruistic
*/
int main() {
int i, j;
while(scanf("%s %s", x, y) == 2) {
lx = strlen(x), ly = strlen(y);
for(i = 0; i <= lx; i++) {
for(j = 0; j <= ly; j++) {
dp[i][j] = 0xffff;
}
}
dp[0][0] = 0;
int cost[6] = {1,1,1,1,1,1};
for(i = 0; i <= lx; i++) {
for(j = 0; j <= ly; j++) {
//copy
if(x[i] == y[j]) {
dp[i+1][j+1] = min(dp[i+1][j+1], dp[i][j]+cost[0]);
if(dp[i+1][j+1] == dp[i][j]+cost[0])
arg_dp[i+1][j+1] = -1;
}
//replace
if(x[i] != y[j]) {
dp[i+1][j+1] = min(dp[i+1][j+1], dp[i][j]+cost[1]);
if(dp[i+1][j+1] == dp[i][j]+cost[1])
arg_dp[i+1][j+1] = -2;
}
//delete
dp[i+1][j] = min(dp[i+1][j], dp[i][j]+cost[2]);
if(dp[i+1][j] == dp[i][j]+cost[2])
arg_dp[i+1][j] = -3;
//insert
dp[i][j+1] = min(dp[i][j+1], dp[i][j]+cost[3]);
if(dp[i][j+1] == dp[i][j]+cost[3])
arg_dp[i][j+1] = -4;
//twiddle
if(x[i] == y[j+1] && x[i+1] == y[j]) {
dp[i+2][j+2] = min(dp[i+2][j+2], dp[i][j]+cost[4]);
if(dp[i+2][j+2] == dp[i][j]+cost[4])
arg_dp[i+2][j+2] = -5;
}
//kill
dp[lx][j] = min(dp[lx][j], dp[i][j]+cost[5]);
if(dp[lx][j] == dp[i][j]+cost[5])
arg_dp[lx][j] = i;
}
}
printf("min cost %d\n", dp[lx][ly]);
print(lx, ly);
}
return 0;
}
後面有一個延伸問題 DNA Sequence Alignment, 利用原本的 edit distance 求此問題, 很明顯地就是 copy, replace delete 的花費對照
O(n*n)
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <algorithm>
using namespace std;
int dp[1000][1000], lx, ly;
int arg_dp[1000][1000];
char x[1000], y[1000], buf[1000];
char mx[1000], my[1000], *px, *py;
void print(int i, int j) {
if(i == 0 && j == 0)
return;
if(arg_dp[i][j] == -1) {
print(i-1, j-1);
*px = x[i-1], *py = y[j-1];
px++, py++;
} else if(arg_dp[i][j] == -2) {
print(i-1, j-1);
*px = x[i-1], *py = y[j-1];
px++, py++;
} else if(arg_dp[i][j] == -3) {
print(i-1, j);
*px = x[i-1], *py = ' ';
px++, py++;
} else if(arg_dp[i][j] == -4) {
print(i, j-1);
*px = ' ', *py = y[j-1];
px++, py++;
}
}/*
GATCGGCAT CAATGTGAATC
CAATGTGAATC GATCGGCAT
*/
int main() {
int i, j;
while(scanf("%s %s", x, y) == 2) {
lx = strlen(x), ly = strlen(y);
for(i = 0; i <= lx; i++) {
for(j = 0; j <= ly; j++) {
dp[i][j] = -0xffff;
}
}
dp[0][0] = 0;
int cost[6] = {1,-1,-2,-2};
for(i = 0; i <= lx; i++) {
for(j = 0; j <= ly; j++) {
//copy
if(x[i] == y[j]) {
dp[i+1][j+1] = max(dp[i+1][j+1], dp[i][j]+cost[0]);
if(dp[i+1][j+1] == dp[i][j]+cost[0])
arg_dp[i+1][j+1] = -1;
}
//replace
if(x[i] != y[j]) {
dp[i+1][j+1] = max(dp[i+1][j+1], dp[i][j]+cost[1]);
if(dp[i+1][j+1] == dp[i][j]+cost[1])
arg_dp[i+1][j+1] = -2;
}
//delete
dp[i+1][j] = max(dp[i+1][j], dp[i][j]+cost[2]);
if(dp[i+1][j] == dp[i][j]+cost[2])
arg_dp[i+1][j] = -3;
//insert
dp[i][j+1] = max(dp[i][j+1], dp[i][j]+cost[3]);
if(dp[i][j+1] == dp[i][j]+cost[3])
arg_dp[i][j+1] = -4;
}
}
printf("max score %d\n", dp[lx][ly]);
px = mx, py = my;
print(lx, ly);
*px = '\0', *py = '\0';
printf("%s\n%s\n", mx, my);
}
return 0;
}
文章定位: