Algorithm Implementation/Strings/Longest common substring
From Wikibooks, the open-content textbooks collection
Note to reader: It is unavoidable for this algorithm that O(nm) time is used, but all of these implementations also use O(nm) storage. The astute reader will notice that only the previous column of the grid storing the dynamic state is ever actually used in computing the next column. Thus, these algorithm can be altered to have only an O(n) storage requirement. By reassigning array references between two 1D arrays, this can be done without copying the state data from one array to another. I may return later and update this page accordingly; for now, this optimization is left as an exercise to the reader.
Contents |
[edit] C#
[edit] Length of Longest Substring
Given two non-empty strings as parameters, this method will return the length of the longest substring common to both parameters. A variant, below, returns the actual string.
public int LongestCommonSubstring(string str1, string str2) { if (String.IsNullOrEmpty(str1) || String.IsNullOrEmpty(str2)) return 0; int[,] num = new int[str1.Length, str2.Length]; int maxlen = 0; for (int i = 0; i < str1.Length; i++) { for (int j = 0; j < str2.Length; j++) { if (str1[i] != str2[j]) num[i, j] = 0; else { if ((i == 0) || (j == 0)) num[i, j] = 1; else num[i, j] = 1 + num[i - 1, j - 1]; if (num[i, j] > maxlen) { maxlen = num[i, j]; } } } } return maxlen; }
[edit] Retrieve the Longest Substring
This example uses the out keyword to pass in a string refrence which the method will set to a string containing the longest common substring.
public int LongestCommonSubstring(string str1, string str2, out string sequence) { sequence = string.Empty; if (String.IsNullOrEmpty(str1) || String.IsNullOrEmpty(str2)) return 0; int[,] num = new int[str1.Length, str2.Length]; int maxlen = 0; int lastSubsBegin = 0; StringBuilder sequenceBuilder = new StringBuilder(); for (int i = 0; i < str1.Length; i++) { for (int j = 0; j < str2.Length; j++) { if (str1[i] != str2[j]) num[i, j] = 0; else { if ((i == 0) || (j == 0)) num[i, j] = 1; else num[i, j] = 1 + num[i - 1, j - 1]; if (num[i, j] > maxlen) { maxlen = num[i, j]; int thisSubsBegin = i - num[i, j] + 1; if (lastSubsBegin == thisSubsBegin) {//if the current LCS is the same as the last time this block ran sequenceBuilder.Append(str1[i]); } else //this block resets the string builder if a different LCS is found { lastSubsBegin = thisSubsBegin; sequenceBuilder.Remove(0, sequenceBuilder.Length);//clear it sequenceBuilder.Append(str1.Substring(lastSubsBegin, (i + 1) - lastSubsBegin)); } } } } } sequence = sequenceBuilder.ToString(); return maxlen; }
The extra complexity in this method keeps the number of new String objects created to a minimum. This is important in C# because, since strings are immutable: every time a string field is assigned to, the old string sits in memory untill the garbage collector runs. Therefore some effort was put into keeping the number of new strings low.
The algorithm might be simplified (left as an exercise to the reader) by tracking only the start position (in, say str1, or both str1 and str2) of the string, and leaving it to the caller to extract the string using this and the returned length. Such a variant may prove more useful, too, as the actual locations in the subject strings would be identified.
// odev 1.cpp : Defines the entry point for the console application. // #include "stdafx.h" #include "iostream" using namespace std; char **A; int _tmain(int argc, _TCHAR* argv[]) { int satir, sira=1; cin >> satir; A = new char *[satir]; for(int i=0; i<satir; i++) *(A+i)= new char [sira]; for(int j=1; j<=satir; j++) cin >> A[0][j]; for(int j=1; j<=satir; j++) cout << A[0][j]; cin >> sira; for(int i=1; i<=sira; i++) cin >> A[i][0]; for(int j=1; j<=satir; j++) cout << A[0][j]; for(int i=1;i<=sira;i++){ for(int j=1; j<=satir; j++){ if(A[0][j]==A[i][0]){ cout << A[i][0]; } } } return 0; }
[edit] Python
def LCSubstr_len(S, T): m = len(S); n = len(T) L = [[0] * (n+1) for i in xrange(m+1)] lcs = 0 for i in xrange(m): for j in xrange(n): if S[i] == T[j]: L[i+1][j+1] = L[i][j] + 1 lcs = max(lcs, L[i+1][j+1]) return lcs def LCSubstr_set(S, T): m = len(S); n = len(T) L = [[0] * (n+1) for i in xrange(m+1)] LCS = set() longest = 0 for i in xrange(m): for j in xrange(n): if S[i] == T[j]: v = L[i][j] + 1 L[i+1][j+1] = v if v > longest: longest = v LCS = set() if v == longest: LCS.add(S[i-v+1:i+1]) return LCS
[edit] Perl
sub lc_substr { my ($str1, $str2) = @_; my $l_length = 0; # length of longest common substring my $len1 = length $str1; my $len2 = length $str2; my @char1 = (undef, split(//, $str1)); # $str1 as array of chars, indexed from 1 my @char2 = (undef, split(//, $str2)); # $str2 as array of chars, indexed from 1 my @lc_suffix; # "longest common suffix" table my @substrings; # list of common substrings of length $l_length for my $n1 ( 1 .. $len1 ) { for my $n2 ( 1 .. $len2 ) { if ($char1[$n1] eq $char2[$n2]) { # We have found a matching character. Is this the first matching character, or a # continuation of previous matching characters? If the former, then the length of # the previous matching portion is undefined; set to zero. $lc_suffix[$n1-1][$n2-1] ||= 0; # In either case, declare the match to be one character longer than the match of # characters preceding this character. $lc_suffix[$n1][$n2] = $lc_suffix[$n1-1][$n2-1] + 1; # If the resulting substring is longer than our previously recorded max length ... if ($lc_suffix[$n1][$n2] > $l_length) { # ... we record its length as our new max length ... $l_length = $lc_suffix[$n1][$n2]; # ... and clear our result list of shorter substrings. @substrings = (); } # If this substring is equal to our longest ... if ($lc_suffix[$n1][$n2] == $l_length) { # ... add it to our list of solutions. push @substrings, substr($str1, ($n1-$l_length), $l_length); } } } } return @substrings; }
[edit] VB.NET
Public Function LongestCommonSubstring(ByVal s1 As String, ByVal s2 As String) As Integer Dim num(s1.Length - 1, s2.Length - 1) As Integer '2D array Dim letter1 As Char = Nothing Dim letter2 As Char = Nothing Dim len As Integer = 0 Dim ans As Integer = 0 For i As Integer = 0 To s1.Length - 1 For j As Integer = 0 To s2.Length - 1 letter1 = s1.Chars(i) letter2 = s2.Chars(j) If Not letter1.Equals(letter2) Then num(i, j) = 0 Else If i.Equals(0) Or j.Equals(0) Then num(i, j) = 1 Else num(i, j) = 1 + num(i - 1, j - 1) End If If num(i, j) > len Then len = num(i, j) ans = num(i, j) End If End If Next j Next i Return ans End Function
[edit] C++
We need to #include <string> because we use its std::string class.
int LongestCommonSubstring(const string& str1, const string& str2) { if(str1.empty() || str2.empty()) { return 0; } int *curr = new int [str2.size()]; int *prev = new int [str2.size()]; int *swap = NULL; int maxSubstr = 0; for(int i = 0; i<str1.size(); ++i) { for(int j = 0; j<str2.size(); ++j) { if(str1[i] != str2[j]) { curr[j] = 0; } else { if(i == 0 || j == 0) { curr[j] = 1; } else { curr[j] = 1 + prev[j-1]; } //The next if can be replaced with: //maxSubstr = max(maxSubstr, curr[j]); //(You need algorithm.h library for using max()) if(maxSubstr < curr[j]) { maxSubstr = curr[j]; } } } swap=curr; curr=prev; prev=swap; } delete [] curr; delete [] prev; return maxSubstr; }
[edit] Ruby
def lcs(s1, s2) num=Array.new(s1.size){Array.new(s2.size)} len,ans=0 s1.scan(/./).each_with_index do |l1,i | s2.scan(/./).each_with_index do |l2,j | unless l1==l2 num[i][j]=0 else (i==0 || j==0)? num[i][j]=1 : num[i][j]=1 + num[i-1][j-1] len = ans = num[i][j] if num[i][j] > len end end end ans end
[edit] Java
public int longestSubstr(String str_, String toCompare_) { if (str_.isEmpty() || toCompare_.isEmpty()) return 0; int[][] compareTable = new int[str_.length()][toCompare_.length()]; int maxLen = 0; for (int m = 0; m < str_.length(); m++) { for (int n = 0; n < toCompare_.length(); n++) { compareTable[m][n] = (str_.charAt(m) != toCompare_.charAt(n)) ? 0 : (((m == 0) || (n == 0)) ? 1 : compareTable[m - 1][n - 1] + 1); maxLen = (compareTable[m][n] > maxLen) ? compareTable[m][n] : maxLen; } } return maxLen; }
[edit] PHP
function strlcs($str1, $str2){ $m = strlen($str1); $n = strlen($str2); $L = array(); $z = 0; $ret = array(); for($i=0; $i<$m; $i++){ $L[$i] = array(); for($j=0; $j<$n; $j++){ $L[$i][$j] = 0; } } for($i=0; $i<$m; $i++){ for($j=0; $j<$n; $j++){ if( $str1[$i] == $str2[$j] ){ $L[$i][$j] = $L[$i-1][$j-1] + 1; if( $L[$i][$j] > $z ){ $z = $L[$i][$j]; $ret = array(); } if( $L[$i][$j] == $z ) $ret[] = substr($str1, $i-$z+1, $z); } } } return $ret; }