Algorithm Implementation/Strings/Dice's coefficient

From Wikibooks, open books for an open world
Jump to: navigation, search

See Wikipedia article for explanation.

Contents

[edit] C++

/*
 * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
 * (C) 2007 Francis Tyers 
 * Modifications made by Stefan Koshiw 2010
 * Now it outputs values [0..1]
 * Released under the terms of the GNU GPL.
 */
 
float dice_coefficient(wstring string1, wstring string2)
{
 
        set<string> string1_bigrams;
        set<string> string2_bigrams;
 
        //base case
        if(string1.length() == 0 || string2.length() == 0)
        {
                return 0;
        }        
 
        for(unsigned int i = 0; i < (string1.length() - 1); i++) {      // extract character bigrams from string1
                string1_bigrams.insert(string1.substr(i, 2));
        }
        for(unsigned int i = 0; i < (string2.length() - 1); i++) {      // extract character bigrams from string2
                string2_bigrams.insert(string2.substr(i, 2));
        }
 
        int intersection = 0;
 
        // find the intersection between the two sets
 
        for(set<string>::iterator IT = string2_bigrams.begin(); 
            IT != string2_bigrams.end(); 
            IT++)
        {    
                intersection += string1_bigrams.count((*IT));
        }
 
        // calculate dice coefficient
        int total = string1_bigrams.size() + string2_bigrams.size();
        float dice = (float)(intersection * 2) / (float)total;
 
        return dice;
}

[edit] Java

import java.util.HashSet;
import java.util.Set;
 
//Note that this implementation is case-sensitive!
public static double diceCoefficient(String s1, String s2)
{
        Set<String> nx = new HashSet<String>();
        Set<String> ny = new HashSet<String>();
 
        for (int i=0; i < s1.length()-1; i++) {
                char x1 = s1.charAt(i);
                char x2 = s1.charAt(i+1);
                String tmp = "" + x1 + x2;
                nx.add(tmp);
        }
        for (int j=0; j < s2.length()-1; j++) {
                char y1 = s2.charAt(j);
                char y2 = s2.charAt(j+1);
                String tmp = "" + y1 + y2;
                ny.add(tmp);
        }
 
        Set<String> intersection = new HashSet<String>(nx);
        intersection.retainAll(ny);
        double totcombigrams = intersection.size();
 
        return (2*totcombigrams) / (nx.size()+ny.size());
}

[edit] Perl

sub dice_coefficient(){
     my ($str1, $str2) = @_;
 
     return 0 if (! length $str1 || ! length $str2 );
 
     #### bigrams using REGEX.
     ##my @bigrams_str1 = $str1 =~ m/ (?= (..) ) /xmsg;
     ##my @bigrams_str2 = $str2 =~ m/ (?= (..) ) /xmsg;
 
     my $len1 = ( length($str1) -  1 );
     for (my $i=0; $i<$len1; $i++){
         push @bigrams_str1, substr($str1, $i, 2);
     }
 
     my $len2 = ( length($str2) -  1 );
     for (my $i=0; $i<$len2; $i++){
         push @bigrams_str1, substr($str2, $i, 2);
     }
 
     my $intersect = 0;
     for (my $i=0; $i<=$#bigrams_str1; $i++){
          if ( grep /^$bigrams_str1[$i]$/, @bigrams_str2){
               $intersect++;
          }
     }
 
     my $combinedLength = ($#bigram_str1+1 + $#bigrams_str2+1);
     my $dice = ($intersect * 2 / $combinedLength);
 
     return $dice;
}

[edit] Python

def dice_coefficient(a, b):
    """dice coefficient 2nt/na + nb."""
    a_bigrams = set(a)
    b_bigrams = set(b)
    overlap = len(a_bigrams & b_bigrams)
    return overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
 
""" more orthodox and robust implementation """
def dice_coefficient(a, b):
    """dice coefficient 2nt/na + nb."""
    if not len(a) or not len(b): return 0.0
    if len(a) == 1:  a=a+u'.'
    if len(b) == 1:  b=b+u'.'
 
    a_bigram_list=[]
    for i in range(len(a)-1):
      a_bigram_list.append(a[i:i+2])
    b_bigram_list=[]
    for i in range(len(b)-1):
      b_bigram_list.append(b[i:i+2])
 
    a_bigrams = set(a_bigram_list)
    b_bigrams = set(b_bigram_list)
    overlap = len(a_bigrams & b_bigrams)
    dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams))
    return dice_coeff

[edit] Ruby

require 'set'
 
# dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b
def dice_coefficient(a, b)
        a_bigrams = a.each_char.each_cons(2).to_set
        b_bigrams = b.each_char.each_cons(2).to_set
 
        overlap = (a_bigrams & b_bigrams).size
 
        total = a_bigrams.size + b_bigrams.size
        dice  = overlap * 2.0 / total
 
        dice
end
Personal tools
Namespaces

Variants
Actions
Navigation
Community
Toolbox
Sister projects
Print/export