XQuery/Tag Cloud

From Wikibooks, the open-content textbooks collection

< XQuery
Jump to: navigation, search

Contents

[edit] Counting Words

[edit] Total of words in a text object

From Jon Robie's blog

(: Counts the words in the text content of the externally supplied $doc. :)
declare variable $doc external;

(: Join every descendant text node into one space-separated string. :)
let $txt := string-join( $doc//text() , " ")
(: Split on runs of whitespace, the punctuation marks , . ! : ; or the literal
   characters "nbsp;".
   NOTE(review): the "nbsp;" alternative presumably targets the tail of an
   unresolved non-breaking-space entity left in the text; confirm it is needed. :)
let $tokens := tokenize($txt,'(\s|[,.!:;]|[n][b][s][p][;])+')
return
  (: tokenize() yields a zero-length string when the input starts or ends with
     a separator (e.g. a trailing full stop); drop those so they are not
     counted as words. :)
  count($tokens[. != ''])

Note that the string-join() function takes an input sequence and returns a single string in which the items are joined by the separator given as its second argument (here, a single space).

If you want to see what this counts as a word in your document, use the following variation.

(: Returns the joined text of $doc wrapped in a <words> element whose count
   attribute holds the number of words found in that text. :)
declare variable $doc external;
let $txt := string-join( $doc//text() , " ")
(: tokenize() yields a zero-length string when the text starts or ends with a
   separator; the predicate removes those so they are not counted as words. :)
let $words := tokenize($txt,'(\s|[,.!:;]|[n][b][s][p][;])+')[. != '']
return   <words count="{count($words)}">
     { $txt }
  </words>

Another variation is the word-count() function found at xqueryfunctions.com:

(:~
 : Counts the words in a string.
 :
 : The string is split on runs of non-word characters (\W+); zero-length
 : tokens are ignored.
 :
 : @param $arg the text to examine; may be the empty sequence
 : @return the number of non-empty tokens
 :)
declare function local:word-count( $arg as xs:string? )  as xs:integer {
   let $tokens := tokenize($arg, '\W+')
   let $words :=
      for $token in $tokens
      where $token != ''
      return $token
   return count($words)
};

This version splits the text on the \W+ regular expression, which matches runs of non-word characters, and discards any empty tokens before counting.

[edit] Counting Keywords

Kurt Cagle suggested the following XQuery for counting keywords:

declare namespace xqwb="http://xquery.wikibooks.org";
declare option exist:serialize "method=xml media-type=text/xml indent=yes";

(:~
 : Builds a frequency table for the term children of an element.
 :
 : @param $wordlist an element whose term children hold the keywords
 : @return a terms element with one term child per distinct keyword; each
 :         child carries the keyword as its text and the number of times it
 :         occurs in $wordlist as its count attribute
 :)
declare function xqwb:word-count($wordlist as element() ) as element() {
  element terms {
    for $value in distinct-values($wordlist/term)
    return
      element term {
        (: occurrences of this keyword among the input terms :)
        attribute count { count($wordlist/term[. = $value]) },
        $value
      }
  }
};

(: Sample keyword list: one term element per keyword occurrence. :)
let $keywords :=
  <keywords>{
    for $colour in ('red', 'green', 'red', 'blue', 'violet', 'red', 'blue',
                    'blue', 'red', 'orange', 'green', 'yellow', 'indigo', 'red')
    return <term>{$colour}</term>
  }</keywords>

(: Emit the frequency table for the sample list. :)
return xqwb:word-count($keywords)

[Execute]

[edit] This Returns the Following

<terms>
    <term count="5">red</term>
    <term count="2">green</term>
    <term count="3">blue</term>
    <term count="1">violet</term>
    <term count="1">orange</term>
    <term count="1">yellow</term>
    <term count="1">indigo</term>
</terms>

[edit] Creating a Tag Cloud

From this you can create a tag cloud, or word-density map, such as the "Popular Tags" page on the Flickr web site: Flickr Popular Tags

declare namespace xqwb="http://xquery.wikibooks.org";
declare option exist:serialize "method=xhtml media-type=text/html indent=yes";

(:~
 : Builds a frequency table for the term children of an element.
 :
 : @param $wordlist an element whose term children hold the keywords
 : @return a terms element with one term child per distinct keyword; each
 :         child carries the keyword as its text and the number of times it
 :         occurs in $wordlist as its count attribute
 :)
declare function xqwb:word-count($wordlist as element() ) as element() {
  element terms {
    for $value in distinct-values($wordlist/term)
    return
      element term {
        (: occurrences of this keyword among the input terms :)
        attribute count { count($wordlist/term[. = $value]) },
        $value
      }
  }
};


(: Sample keyword list: one term element per keyword occurrence. :)
let $keywords := 
<keywords>
   <term>red</term>
   <term>green</term>
   <term>red</term>
   <term>blue</term>
   <term>violet</term>
   <term>red</term>
   <term>blue</term>
   <term>blue</term>
   <term>red</term>
   <term>orange</term>
   <term>green</term>
   <term>yellow</term>
   <term>indigo</term>
   <term>red</term>
</keywords>

(: Frequency table: one term per distinct keyword with its count attribute. :)
let $result := xqwb:word-count($keywords)
(: Total number of keyword occurrences; the denominator for each term's share. :)
let $total := count($keywords/term)
(: Multiplier applied to each term's percentage share to obtain its font size. :)
let $scale := 20

return 
 <div>
  {
  for $term in $result/term
  (: the term's share of all keywords, as a percentage, multiplied by $scale :)
  let $fontSize := round( $term/@count div $total * 100 * $scale)
  order by $term
  (: The separating space is produced inside the enclosed expression: a bare
     space between "}" and the end tag is boundary whitespace, which is
     stripped under the default boundary-space policy and would leave the
     tags running together. :)
  return <span style="font-size:{$fontSize}%">{concat($term, ' ')}</span>
  }
 </div>

Execute