Algorithm Implementation/Strings/Dice's coefficient
From Wikibooks, open books for an open world
< Algorithm Implementation | Strings(Redirected from Algorithm implementation/Strings/Dice's coefficient)
See Wikipedia article for explanation.
Contents |
[edit] C++
/* * dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b * (C) 2007 Francis Tyers * Modifications made by Stefan Koshiw 2010 * Now it outputs values [0..1] * Released under the terms of the GNU GPL. */ float dice_coefficient(wstring string1, wstring string2) { set<string> string1_bigrams; set<string> string2_bigrams; //base case if(string1.length() == 0 || string2.length() == 0) { return 0; } for(unsigned int i = 0; i < (string1.length() - 1); i++) { // extract character bigrams from string1 string1_bigrams.insert(string1.substr(i, 2)); } for(unsigned int i = 0; i < (string2.length() - 1); i++) { // extract character bigrams from string2 string2_bigrams.insert(string2.substr(i, 2)); } int intersection = 0; // find the intersection between the two sets for(set<string>::iterator IT = string2_bigrams.begin(); IT != string2_bigrams.end(); IT++) { intersection += string1_bigrams.count((*IT)); } // calculate dice coefficient int total = string1_bigrams.size() + string2_bigrams.size(); float dice = (float)(intersection * 2) / (float)total; return dice; }
[edit] Java
import java.util.HashSet; import java.util.Set; //Note that this implementation is case-sensitive! public static double diceCoefficient(String s1, String s2) { Set<String> nx = new HashSet<String>(); Set<String> ny = new HashSet<String>(); for (int i=0; i < s1.length()-1; i++) { char x1 = s1.charAt(i); char x2 = s1.charAt(i+1); String tmp = "" + x1 + x2; nx.add(tmp); } for (int j=0; j < s2.length()-1; j++) { char y1 = s2.charAt(j); char y2 = s2.charAt(j+1); String tmp = "" + y1 + y2; ny.add(tmp); } Set<String> intersection = new HashSet<String>(nx); intersection.retainAll(ny); double totcombigrams = intersection.size(); return (2*totcombigrams) / (nx.size()+ny.size()); }
[edit] Perl
sub dice_coefficient(){ my ($str1, $str2) = @_; return 0 if (! length $str1 || ! length $str2 ); #### bigrams using REGEX. ##my @bigrams_str1 = $str1 =~ m/ (?= (..) ) /xmsg; ##my @bigrams_str2 = $str2 =~ m/ (?= (..) ) /xmsg; my $len1 = ( length($str1) - 1 ); for (my $i=0; $i<$len1; $i++){ push @bigrams_str1, substr($str1, $i, 2); } my $len2 = ( length($str2) - 1 ); for (my $i=0; $i<$len2; $i++){ push @bigrams_str1, substr($str2, $i, 2); } my $intersect = 0; for (my $i=0; $i<=$#bigrams_str1; $i++){ if ( grep /^$bigrams_str1[$i]$/, @bigrams_str2){ $intersect++; } } my $combinedLength = ($#bigram_str1+1 + $#bigrams_str2+1); my $dice = ($intersect * 2 / $combinedLength); return $dice; }
[edit] Python
def dice_coefficient(a, b): """dice coefficient 2nt/na + nb.""" a_bigrams = set(a) b_bigrams = set(b) overlap = len(a_bigrams & b_bigrams) return overlap * 2.0/(len(a_bigrams) + len(b_bigrams)) """ more orthodox and robust implementation """ def dice_coefficient(a, b): """dice coefficient 2nt/na + nb.""" if not len(a) or not len(b): return 0.0 if len(a) == 1: a=a+u'.' if len(b) == 1: b=b+u'.' a_bigram_list=[] for i in range(len(a)-1): a_bigram_list.append(a[i:i+2]) b_bigram_list=[] for i in range(len(b)-1): b_bigram_list.append(b[i:i+2]) a_bigrams = set(a_bigram_list) b_bigrams = set(b_bigram_list) overlap = len(a_bigrams & b_bigrams) dice_coeff = overlap * 2.0/(len(a_bigrams) + len(b_bigrams)) return dice_coeff
[edit] Ruby
require 'set' # dice coefficient = bigram overlap * 2 / bigrams in a + bigrams in b def dice_coefficient(a, b) a_bigrams = a.each_char.each_cons(2).to_set b_bigrams = b.each_char.each_cons(2).to_set overlap = (a_bigrams & b_bigrams).size total = a_bigrams.size + b_bigrams.size dice = overlap * 2.0 / total dice end