Algorithm Implementation/Strings/Dice's coefficient

From Wikibooks, open books for an open world
< Algorithm Implementation‎ | Strings
Jump to: navigation, search

Dice's coefficient measures how similar a set and another set are. It can be used to measure how similar two strings are in terms of the number of common bigrams (a bigram is a pair of adjacent letters in the string). Below we give implementations of Dice's coefficient of two strings in different programming languages.

C++[edit]

/*
 * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
 * (C) 2007 Francis Tyers 
 * Modifications made by Stefan Koshiw 2010
 * Now it outputs values [0..1]
 * Released under the terms of the GNU GPL.
 */
 
float dice_coefficient(wstring string1, wstring string2)
{
 
        set<string> string1_bigrams;
        set<string> string2_bigrams;
 
        //base case
        if(string1.length() == 0 || string2.length() == 0)
        {
                return 0;
        }        
 
        for(unsigned int i = 0; i < (string1.length() - 1); i++) {      // extract character bigrams from string1
                string1_bigrams.insert(string1.substr(i, 2));
        }
        for(unsigned int i = 0; i < (string2.length() - 1); i++) {      // extract character bigrams from string2
                string2_bigrams.insert(string2.substr(i, 2));
        }
 
        int intersection = 0;
 
        // find the intersection between the two sets
 
        for(set<string>::iterator IT = string2_bigrams.begin(); 
            IT != string2_bigrams.end(); 
            IT++)
        {    
                intersection += string1_bigrams.count((*IT));
        }
 
        // calculate dice coefficient
        int total = string1_bigrams.size() + string2_bigrams.size();
        float dice = (float)(intersection * 2) / (float)total;
 
        return dice;
}


Haskell[edit]

a naive, non-optimized version

import	Data.List	(intersect, nub)
 
diceCoefficient sa sb =
	fromIntegral (2 * length (intersect agrams bgrams)) / fromIntegral (length agrams + length bgrams)	
	where
		agrams		= bigrams sa
		bgrams		= bigrams sb
		bigrams xs	= nub $ zip xs (tail xs)

Java[edit]

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;
 
//Note that this implementation is case-sensitive!
public static double diceCoefficient(String s1, String s2)
{
	Set<String> nx = new HashSet<String>();
	Set<String> ny = new HashSet<String>();
 
	for (int i=0; i < s1.length()-1; i++) {
		char x1 = s1.charAt(i);
		char x2 = s1.charAt(i+1);
		String tmp = "" + x1 + x2;
		nx.add(tmp);
	}
	for (int j=0; j < s2.length()-1; j++) {
		char y1 = s2.charAt(j);
		char y2 = s2.charAt(j+1);
		String tmp = "" + y1 + y2;
		ny.add(tmp);
	}
 
	Set<String> intersection = new HashSet<String>(nx);
	intersection.retainAll(ny);
	double totcombigrams = intersection.size();
 
	return (2*totcombigrams) / (nx.size()+ny.size());
}
 
/**
 * Here's an optimized version of the dice coefficient calculation. It takes
 * advantage of the fact that a bigram of 2 chars can be stored in 1 int, and
 * applies a matching algorithm of O(n*log(n)) instead of O(n*n).
 * 
 * <p>Note that, at the time of writing, this implementation differs from the
 * other implementations on this page. Where the other algorithms incorrectly
 * store the generated bigrams in a set (discarding duplicates), this
 * implementation actually treats multiple occurrences of a bigram as unique.
 * The correctness of this behavior is most easily seen when getting the
 * similarity between "GG" and "GGGGGGGG", which should obviously not be 1.
 * 
 * @param s The first string
 * @param t The second String
 * @return The dice coefficient between the two input strings. Returns 0 if one
 *         or both of the strings are {@code null}. Also returns 0 if one or both
 *         of the strings contain less than 2 characters and are not equal.
 * @author Jelle Fresen
 */
public static double diceCoefficientOptimized(String s, String t)
{
	// Verifying the input:
	if (s == null || t == null)
		return 0;
	// Quick check to catch identical objects:
	if (s == t)
		return 1;
        // avoid exception for single character searches
        if (s.length() < 2 || t.length() < 2)
            return 0;
 
	// Create the bigrams for string s:
	final int n = s.length()-1;
	final int[] sPairs = new int[n];
	for (int i = 0; i <= n; i++)
		if (i == 0)
			sPairs[i] = s.charAt(i) << 16;
		else if (i == n)
			sPairs[i-1] |= s.charAt(i);
		else
			sPairs[i] = (sPairs[i-1] |= s.charAt(i)) << 16;
 
	// Create the bigrams for string t:
	final int m = t.length()-1;
	final int[] tPairs = new int[m];
	for (int i = 0; i <= m; i++)
		if (i == 0)
			tPairs[i] = t.charAt(i) << 16;
		else if (i == m)
			tPairs[i-1] |= t.charAt(i);
		else
			tPairs[i] = (tPairs[i-1] |= t.charAt(i)) << 16;
 
	// Sort the bigram lists:
	Arrays.sort(sPairs);
	Arrays.sort(tPairs);
 
	// Count the matches:
	int matches = 0, i = 0, j = 0;
	while (i < n && j < m)
	{
		if (sPairs[i] == tPairs[j])
		{
			matches += 2;
			i++;
			j++;
		}
		else if (sPairs[i] < tPairs[j])
			i++;
		else
			j++;
	}
	return (double)matches/(n+m);
}

Javascript[edit]

function dice_coefficient(string1, string2) {
  var intersection = 0;
  var length1 = string1.length - 1;
  var length2 = string2.length - 1;
  if(length1 < 1 || length2 < 1) return 0;
  var bigrams2 = [];
  for(var i = 0; i < length2; i++) {
    bigrams2.push(string2.substr(i,2));
  }
  for(var i = 0; i < length1; i++) {
    var bigram1 = string1.substr(i, 2);
    for(var j = 0; j < length2; j++) {
      if(bigram1 == bigrams2[j]) {
        intersection++;
        bigrams2[j] = null;
        break;
  }}} 
  return (2.0 * intersection) / (length1 + length2);  
}

Perl[edit]

sub dice_coefficient(){
     my ($str1, $str2) = @_;
 
     return 0 if (! length $str1 || ! length $str2 );
 
     #### bigrams using REGEX.
     ##my @bigrams_str1 = $str1 =~ m/ (?= (..) ) /xmsg;
     ##my @bigrams_str2 = $str2 =~ m/ (?= (..) ) /xmsg;
 
     my $len1 = ( length($str1) -  1 );
     for (my $i=0; $i<$len1; $i++){
         push @bigrams_str1, substr($str1, $i, 2);
     }
 
     my $len2 = ( length($str2) -  1 );
     for (my $i=0; $i<$len2; $i++){
         push @bigrams_str2, substr($str2, $i, 2);
     }
 
     my $intersect = 0;
     for my $bigram (@bigrams_str1){
          if ( grep {if (/^$bigram$/) {$_=""; 1} else {0}} @bigrams_str2){
               $intersect++;
          }
     }
 
     my $combinedLength = ($#bigrams_str1+1 + $#bigrams_str2+1);
     my $dice = ($intersect * 2 / $combinedLength);
 
     return $dice;
}

Python[edit]

def dice_coefficient(a, b):
    """dice coefficient 2nt/na + nb."""
    a_bigrams = set(a)
    b_bigrams = set(b)
    overlap = len(a_bigrams & b_bigrams)
    return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
 
""" more orthodox and robust implementation """
def dice_coefficient(a, b):
    """dice coefficient 2nt/na + nb."""
    if not len(a) or not len(b): return 0.0
    if len(a) == 1:  a=a+u'.'
    if len(b) == 1:  b=b+u'.'
 
    a_bigram_list=[]
    for i in range(len(a)-1):
      a_bigram_list.append(a[i:i+2])
    b_bigram_list=[]
    for i in range(len(b)-1):
      b_bigram_list.append(b[i:i+2])
 
    a_bigrams = set(a_bigram_list)
    b_bigrams = set(b_bigram_list)
    overlap = len(a_bigrams & b_bigrams)
    dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
    return dice_coeff
 
""" duplicate bigrams in a word should be counted distinctly
(per discussion), otherwise 'AA' and 'AAAA' would have a 
dice coefficient of 1...
"""
def dice_coefficient(a,b):
    if not len(a) or not len(b): return 0.0
    """ quick case for true duplicates """
    if a == b: return 1.0
    """ if a != b, and a or b are single chars, then they can't possibly match """
    if len(a) == 1 or len(b) == 1: return 0.0
 
    """ use python list comprehension, preferred over list.append() """
    a_bigram_list = [a[i:i+2] for i in range(len(a)-1)]
    b_bigram_list = [b[i:i+2] for i in range(len(b)-1)]
 
    a_bigram_list.sort()
    b_bigram_list.sort()
 
    # assignments to save function calls
    lena = len(a_bigram_list)
    lenb = len(b_bigram_list)
    # initialize match counters
    matches = i = j = 0
    while (i < lena and j < lenb):
        if a_bigram_list[i] == b_bigram_list[j]:
            matches += 2
            i += 1
            j += 1
        elif a_bigram_list[i] < b_bigram_list[j]:
            i += 1
        else:
            j += 1
 
    score = float(matches)/float(lena + lenb)
    return score

Objective C[edit]

int compare (const void * a, const void * b)
{
    return ( *(int*)a - *(int*)b );
}
 
+(float)diceCoefficient:(NSString*)s s2:(NSString*)t
{
    // Verifying the input:
    if (s == nil || t == nil) return 0;
    // Quick check to catch identical objects:
    if (s == t) return 1;
    // avoid exception for single character searches
    if (s.length < 2 || t.length < 2) return 0;
 
    // Create the bigrams for string s:
    int n = s.length-1;
    int sPairs[n];
    for (int i = 0; i <= n; i++)
        if (i == 0) sPairs[i] = [s characterAtIndex:i] << 16;
        else if (i == n) sPairs[i-1] |= [s characterAtIndex:i];
        else sPairs[i] = (sPairs[i-1] |= [s characterAtIndex:i]) << 16;
 
    // Create the bigrams for string t:
    int m = t.length-1;
    int tPairs[m];
    for (int i = 0; i <= m; i++)
        if (i == 0) tPairs[i] = [t characterAtIndex:i] << 16;
        else if (i == m) tPairs[i-1] |= [t characterAtIndex:i];
        else tPairs[i] = (tPairs[i-1] |= [t characterAtIndex:i]) << 16;
 
    // Sort the bigram lists:
    qsort(sPairs, n, sizeof(int), compare);
    qsort(tPairs, m, sizeof(int), compare);
 
    // Count the matches:
    int matches = 0, i = 0, j = 0;
    while (i < n && j < m) {
        if (sPairs[i] == tPairs[j]) {
            matches += 2;
            i++;
            j++;
        } else
            if (sPairs[i] < tPairs[j]) i++; else j++;
    }
    return (float)matches/(n+m);
}

Ruby[edit]

require 'set'
 
# dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
def dice_coefficient(a, b)
        a_bigrams = a.each_char.each_cons(2).to_set
        b_bigrams = b.each_char.each_cons(2).to_set
 
        overlap = (a_bigrams & b_bigrams).size
 
        total = a_bigrams.size + b_bigrams.size
        dice  = overlap * 2.0 / total
 
        dice
end

C#[edit]

public static double DiceCoefficient(string stOne, string stTwo)
        {
            HashSet<string> nx = new HashSet<string>();
            HashSet<string> ny = new HashSet<string>();
 
            for(int i = 0; i < stOne.Length - 1; i++)
            {
                char x1 = stOne[i];
                char x2 = stOne[i + 1];
                string temp = x1.ToString() + x2.ToString();
                nx.Add(temp);
            }
            for(int j = 0; j < stTwo.Length - 1; j++)
            {
                char y1 = stTwo[j];
                char y2 = stTwo[j + 1];
                string temp = y1.ToString() + y2.ToString();
                ny.Add(temp);
            }
 
            HashSet<string> intersection = new HashSet<string>(nx);
            intersection.IntersectWith(ny);
 
            double dbOne = intersection.Count;
            return (2 * dbOne) / (nx.Count + ny.Count);
 
        }

J[edit]

bigrams =: (_1}.])(<@,"0)(1}.])
 
dice_coefficient =: (2&*@~.@%@#)@(bigrams,bigrams)