# Algorithm Implementation/Strings/Dice's coefficient

(Redirected from Algorithm implementation/Strings/Dice's coefficient)
Jump to: navigation, search

Dice's coefficient measures how similar a set and another set are. It can be used to measure how similar two strings are in terms of the number of common bigrams (a bigram is a pair of adjacent letters in the string). Below we give implementations of Dice's coefficient of two strings in different programming languages.

## C++

```/*
* dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
* (C) 2007 Francis Tyers
* Modifications made by Stefan Koshiw 2010
* Now it outputs values [0..1]
* Released under the terms of the GNU GPL.
*/

float dice_coefficient(wstring string1, wstring string2)
{

set<string> string1_bigrams;
set<string> string2_bigrams;

//base case
if(string1.length() == 0 || string2.length() == 0)
{
return 0;
}

for(unsigned int i = 0; i < (string1.length() - 1); i++) {      // extract character bigrams from string1
string1_bigrams.insert(string1.substr(i, 2));
}
for(unsigned int i = 0; i < (string2.length() - 1); i++) {      // extract character bigrams from string2
string2_bigrams.insert(string2.substr(i, 2));
}

int intersection = 0;

// find the intersection between the two sets

for(set<string>::iterator IT = string2_bigrams.begin();
IT != string2_bigrams.end();
IT++)
{
intersection += string1_bigrams.count((*IT));
}

// calculate dice coefficient
int total = string1_bigrams.size() + string2_bigrams.size();
float dice = (float)(intersection * 2) / (float)total;

return dice;
}
```

## Haskell

a naive, non-optimized version

```import	Data.List	(intersect, nub)

diceCoefficient sa sb =
fromIntegral (2 * length (intersect agrams bgrams)) / fromIntegral (length agrams + length bgrams)
where
agrams		= bigrams sa
bgrams		= bigrams sb
bigrams xs	= nub \$ zip xs (tail xs)
```

## Java

```import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

//Note that this implementation is case-sensitive!
public static double diceCoefficient(String s1, String s2)
{
Set<String> nx = new HashSet<String>();
Set<String> ny = new HashSet<String>();

for (int i=0; i < s1.length()-1; i++) {
char x1 = s1.charAt(i);
char x2 = s1.charAt(i+1);
String tmp = "" + x1 + x2;
nx.add(tmp);
}
for (int j=0; j < s2.length()-1; j++) {
char y1 = s2.charAt(j);
char y2 = s2.charAt(j+1);
String tmp = "" + y1 + y2;
ny.add(tmp);
}

Set<String> intersection = new HashSet<String>(nx);
intersection.retainAll(ny);
double totcombigrams = intersection.size();

return (2*totcombigrams) / (nx.size()+ny.size());
}

/**
* Here's an optimized version of the dice coefficient calculation. It takes
* advantage of the fact that a bigram of 2 chars can be stored in 1 int, and
* applies a matching algorithm of O(n*log(n)) instead of O(n*n).
*
* <p>Note that, at the time of writing, this implementation differs from the
* other implementations on this page. Where the other algorithms incorrectly
* store the generated bigrams in a set (discarding duplicates), this
* implementation actually treats multiple occurrences of a bigram as unique.
* The correctness of this behavior is most easily seen when getting the
* similarity between "GG" and "GGGGGGGG", which should obviously not be 1.
*
* @param s The first string
* @param t The second String
* @return The dice coefficient between the two input strings. Returns 0 if one
*         or both of the strings are {@code null}. Also returns 0 if one or both
*         of the strings contain less than 2 characters and are not equal.
* @author Jelle Fresen
*/
public static double diceCoefficientOptimized(String s, String t)
{
// Verifying the input:
if (s == null || t == null)
return 0;
// Quick check to catch identical objects:
if (s == t)
return 1;
// avoid exception for single character searches
if (s.length() < 2 || t.length() < 2)
return 0;

// Create the bigrams for string s:
final int n = s.length()-1;
final int[] sPairs = new int[n];
for (int i = 0; i <= n; i++)
if (i == 0)
sPairs[i] = s.charAt(i) << 16;
else if (i == n)
sPairs[i-1] |= s.charAt(i);
else
sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16;

// Create the bigrams for string t:
final int m = t.length()-1;
final int[] tPairs = new int[m];
for (int i = 0; i <= m; i++)
if (i == 0)
tPairs[i] = t.charAt(i) << 16;
else if (i == m)
tPairs[i-1] |= t.charAt(i);
else
tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16;

// Sort the bigram lists:
Arrays.sort(sPairs);
Arrays.sort(tPairs);

// Count the matches:
int matches = 0, i = 0, j = 0;
while (i < n && j < m)
{
if (sPairs[i] == tPairs[j])
{
matches += 2;
i++;
j++;
}
else if (sPairs[i] < tPairs[j])
i++;
else
j++;
}
return (double)matches/(n+m);
}
```

## Javascript

```function dice_coefficient(string1, string2) {
var intersection = 0;
var length1 = string1.length - 1;
var length2 = string2.length - 1;
if(length1 < 1 || length2 < 1) return 0;
var bigrams2 = [];
for(var i = 0; i < length2; i++) {
bigrams2.push(string2.substr(i,2));
}
for(var i = 0; i < length1; i++) {
var bigram1 = string1.substr(i, 2);
for(var j = 0; j < length2; j++) {
if(bigram1 == bigrams2[j]) {
intersection++;
bigrams2[j] = null;
break;
}}}
return (2.0 * intersection) / (length1 + length2);
}
```

## Perl

```sub dice_coefficient(){
my (\$str1, \$str2) = @_;

return 0 if (! length \$str1 || ! length \$str2 );

#### bigrams using REGEX.
##my @bigrams_str1 = \$str1 =~ m/ (?= (..) ) /xmsg;
##my @bigrams_str2 = \$str2 =~ m/ (?= (..) ) /xmsg;

my \$len1 = ( length(\$str1) -  1 );
for (my \$i=0; \$i<\$len1; \$i++){
push @bigrams_str1, substr(\$str1, \$i, 2);
}

my \$len2 = ( length(\$str2) -  1 );
for (my \$i=0; \$i<\$len2; \$i++){
push @bigrams_str2, substr(\$str2, \$i, 2);
}

my \$intersect = 0;
for my \$bigram (@bigrams_str1){
if ( grep {if (/^\$bigram\$/) {\$_=""; 1} else {0}} @bigrams_str2){
\$intersect++;
}
}

my \$combinedLength = (\$#bigrams_str1+1 + \$#bigrams_str2+1);
my \$dice = (\$intersect * 2 / \$combinedLength);

return \$dice;
}
```

## Python

```def dice_coefficient(a, b):
"""dice coefficient 2nt/na + nb."""
a_bigrams = set(a)
b_bigrams = set(b)
overlap = len(a_bigrams & b_bigrams)
return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))

""" more orthodox and robust implementation """
def dice_coefficient(a, b):
"""dice coefficient 2nt/na + nb."""
if not len(a) or not len(b): return 0.0
if len(a) == 1:  a=a+u'.'
if len(b) == 1:  b=b+u'.'

a_bigram_list=[]
for i in range(len(a)-1):
a_bigram_list.append(a[i:i+2])
b_bigram_list=[]
for i in range(len(b)-1):
b_bigram_list.append(b[i:i+2])

a_bigrams = set(a_bigram_list)
b_bigrams = set(b_bigram_list)
overlap = len(a_bigrams & b_bigrams)
dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
return dice_coeff

""" duplicate bigrams in a word should be counted distinctly
(per discussion), otherwise 'AA' and 'AAAA' would have a
dice coefficient of 1...
"""
def dice_coefficient(a,b):
if not len(a) or not len(b): return 0.0
""" quick case for true duplicates """
if a == b: return 1.0
""" if a != b, and a or b are single chars, then they can't possibly match """
if len(a) == 1 or len(b) == 1: return 0.0

""" use python list comprehension, preferred over list.append() """
a_bigram_list = [a[i:i+2] for i in range(len(a)-1)]
b_bigram_list = [b[i:i+2] for i in range(len(b)-1)]

a_bigram_list.sort()
b_bigram_list.sort()

# assignments to save function calls
lena = len(a_bigram_list)
lenb = len(b_bigram_list)
# initialize match counters
matches = i = j = 0
while (i < lena and j < lenb):
if a_bigram_list[i] == b_bigram_list[j]:
matches += 2
i += 1
j += 1
elif a_bigram_list[i] < b_bigram_list[j]:
i += 1
else:
j += 1

score = float(matches)/float(lena + lenb)
return score
```

## Ruby

```require 'set'

# dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
def dice_coefficient(a, b)
a_bigrams = a.each_char.each_cons(2).to_set
b_bigrams = b.each_char.each_cons(2).to_set

overlap = (a_bigrams & b_bigrams).size

total = a_bigrams.size + b_bigrams.size
dice  = overlap * 2.0 / total

dice
end
```

## C#

```public static double DiceCoefficient(string stOne, string stTwo)
{
HashSet<string> nx = new HashSet<string>();
HashSet<string> ny = new HashSet<string>();

for(int i = 0; i < stOne.Length - 1; i++)
{
char x1 = stOne[i];
char x2 = stOne[i + 1];
string temp = x1.ToString() + x2.ToString();
nx.Add(temp);
}
for(int j = 0; j < stTwo.Length - 1; j++)
{
char y1 = stTwo[j];
char y2 = stTwo[j + 1];
string temp = y1.ToString() + y2.ToString();
ny.Add(temp);
}

HashSet<string> intersection = new HashSet<string>(nx);
intersection.IntersectWith(ny);

double dbOne = intersection.Count;
return (2 * dbOne) / (nx.Count + ny.Count);

}
```