XQuery/SPARQL Tutorial: Difference between revisions

[unreviewed revision]

Content deleted Content added

Inline

Revision as of 07:52, 4 March 2008

The Employee -Department case study used in the comparison of SQL and XQuery is used here to explore the XQuery/SPARQL pairing. This is a work-in-progress as the writer learns about RDF, the semantic web and linked data.

Example

Taking as the starting point the XML documents defining the three tables:

These tables are dynamically converted to RDF with an XQuery script guided by a configuration file. This RDF can then be queried using SPARQL via an XQuery front end to a Joseki server. The front end supports browsing the RDF graph and direct access to the resource RDF.

Conversion to RDF

An XQuery script generates the RDF. It uses a configuration file to define how columns of a table should be mapped into RDF and the namespaces to be used. This mapping needs a little more work to allow composite keys and allow user defined transformations. An interactive tool to create this map would be useful.

Configuration file

Here is the configuration file for the emp-dept data.

<?xml version="1.0" encoding="UTF-8"?>
<XML-to-RDF>
    <namespaces>
        <namespace prefix="f" uri="http://www.cems.uwe.ac.uk/empdept/concept/" declare="yes"/>
        <namespace prefix="ft" uri="http://www.cems.uwe.ac.uk/empdept/" declare="yes"/>
        <namespace prefix="rdf" uri="http://www.w3.org/1999/02/22-rdf-syntax-ns#" declare="no"/>
        <namespace prefix="rdfs" uri="http://www.w3.org/2000/01/rdf-schema#" declare="no"/>
        <namespace prefix="foaf" uri="http://xmlns.com/foaf/0.1/" declare="yes"/>
        <namespace prefix="xs" uri="http://www.w3.org/2001/XMLSchema#" declare="no"/>
    </namespaces>
    <map type="emp"  prefix="f">
        <source file="/db/Wiki/empdept/emp.xml" path="//Emp"/>
        <col name="EmpNo" pk="true" uribase="ft:emp" type="xs:string"/>
        <col name="Sal" type="xs:integer"/>
        <col name="Comm" type="xs:integer"/>
        <col name="HireDate" type="xs:date"/>
        <col name="MgrNo" tag="Mgr" uribase="ft:emp"/>
        <col name="DeptNo" tag="Dept" uribase="ft:dept"/>
        <col name="Ename" prefix="foaf" tag="surname"/>
        <col name="Job"/>
    </map>
    <map type="dept" prefix="f">
        <source file="/db/Wiki/empdept/dept.xml" path="//Dept"/>
        <col name="Dname"/>
        <col name="Location" uribase="http://dbpedia.org/resource"/>
        <col name="DeptNo" pk="true" uribase="ft:dept" type="xs:string"/>
    </map>
    <map type="salgrade"  prefix="f">
        <source file="/db/Wiki/empdept/salgrade.xml" path="//SalGrade"/>
        <col name="HiSal" type="xs:integer"/>
        <col name="LoSal" type="xs:integer"/>
        <col name="Grade" pk="true" uribase="ft:grade" type="xs:integer"/>
    </map>
</XML-to-RDF>

Data base conversion functions

One function row-to-rdf generates the RDF for a row of a table, another function map-to-schema generates RDFS descriptions of the predicates used in as table.

module namespace  fr= "http://www.cems.uwe.ac.uk/wiki/fr";
import module namespace util = "http://exist-db.org/xquery/util";

declare namespace rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
declare namespace rdfs = "http://www.w3.org/2000/01/rdf-schema#";

declare function fr:declare-namespaces($config) { 
  for $ns in $config//namespace[@declare="yes"]
  return util:declare-namespace($ns/@prefix,xs:anyURI($ns/@uri))
};

declare function fr:expand($qname as xs:string?, $map ) as xs:string ?{
   let $namespace := $map/..//namespace
   return
   if ($qname)
   then  if (contains($qname,":"))
             then  let $qs := tokenize($qname,":")
                       let $prefix := $qs[1]
                       let $name := $qs[2]
                       let $uri := $namespace[@prefix=$prefix]/@uri
                       return concat($uri,$name)
              else if ($namespace[@prefix = $qname])
                       then 
                         $namespace[@prefix = $qname]/@uri
              else 
                   $qname
    else ()
};


declare function fr:row-to-rdf($row as element() , $map as element() ) as element(rdf:Description) * {
       let $pk := $map/col[@pk="true"]
       let $pkv :=  string($row/*[name()=$pk/@name])
       let $pkuri := fr:expand($pk/@uribase, $map)
       return
         <rdf:Description>
            {attribute rdf:about {concat($pkuri,"/",$pkv)}}    
            { if ($map/@type)
              then 
                       let $typeuri := fr:expand(concat($map/@prefix,":",$map/@type),$map)
                       return <rdf:type rdf:resource="{$typeuri}"/>
              else ()
            }
            {for $col  in $map/col
             let $name := $col/@name
             let $data := string($row/*[name(.)=$name])  
              return 
              if ($data !="")
              then 
                   element { concat(($col/@prefix,$map/@prefix)[1], ":", ($col/@tag,$name)[1])}
                   {    
                           if ($col/@type)
                           then (attribute rdf:datatype
                                   {  fr:expand($col/@type,$map)} ,
                                   $data)
                            else if ( $col/@uribase )
                            then  attribute rdf:resource
                                  {concat(fr:expand($col/@uribase,$map), "/",replace($data," ","_"))}
                            else  $data
                    }
              else ()           
            } 
         </rdf:Description>
 };
 
 declare function fr:map-to-schema ($map as element()) as element(rdf:Description) * {
     let $typeuri := fr:expand(concat($map/@prefix,":",$map/@type),$map)
     for $col in $map/col[@type]
     let $prop := concat( fr:expand(($col/@prefix,$map/@prefix)[1],$map ), ($col/@tag,$col/@name)[1])
     let $rangeuri := ( fr:expand($col/@type,$map), fr:expand($col/@uribase,$map),"http://www.w3.org/2000/01/rdf-schema#literal")[1]
     return
       <rdf:Description rdf:about="{$prop}">
           <rdf:type  rdf:resource="http://www.w3.org/1999/02/22-rdf-syntax-ns#Property"/>
           <rdfs:domain rdf:resource="{$typeuri}"/>
           <rdfs:range rdf:resource="{$rangeuri}"/>
           <rdf:label>{string($col/@name)}</rdf:label>
       </rdf:Description>
 };

Full database conversion

The script to generate the RDF for the full database:

import module namespace fr="http://www.cems.uwe.ac.uk/wiki/fr"   at  "fr.xqm";
declare namespace rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
declare namespace rdfs = "http://www.w3.org/2000/01/rdf-schema#";

declare variable $config :=  doc(request:get-parameter("config",()));
declare variable $x := fr:declare-namespaces($config);

<rdf:RDF>
{
  for $map in $config//map
  let $xml := doc($map/source/@file)
  let $source := util:eval(concat("$xml",$map/source/@path))
  return 
        (for $row in $source  return fr:row-to-rdf($row,$map),
         fr:map-to-schema($map)
        )
}
</rdf:RDF>

Links

Get RDF Validate RDF and generate Triples and Graph

Resource RDF

In addition each resource is retrieved as RDF. In this simple example, the request for a resource URI like:

http://www.cems.uwe.ac.uk/empdept/dept/20

is re-written by Apache to

http://www.cems.uwe.ac.uk/xmlwiki/RDF/empdeptrdf.xq?dept=20

This mechanism does not conform to the recommended practice of distinguishing between information resources (such as the information about employee 7369) and the real world entity being represented. At present, the resource URI de-references directly to the RDF,rather than to indirect using the 303 mechanism recommended.

import module namespace fr="http://www.cems.uwe.ac.uk/wiki/fr"   at  "fr.xqm";
declare namespace rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
declare namespace rdfs = "http://www.w3.org/2000/01/rdf-schema#";

declare variable $config := doc("/db/Wiki/RDF/empdeptconfig.xml");
declare variable $x := fr:declare-namespaces($config);

let $param := request:get-parameter-names()
let $type := $param[1]
let $key := request:get-parameter($type,())
let $map := $config//map[@type = $type]
return
<rdf:RDF>
{ if ($map)
  then 
       let $xml := doc($map/source/@file)
       let $source := util:eval(concat("$xml",$map/source/@path))
       let $pkn := $map/col[@pk="true"]/@name
       let $path := concat("$source[",$pkn,"= $key]" )
       let $row := util:eval($path)
       return
          if ($row)
          then fr:row-to-rdf($row,$map)
       else
           concat("Value ", $key, " of Type ",$type," is not found")
  else
         concat("Type ",$type," is not recognised")
         
}
 </rdf:RDF>

Issues in mapping to RDF

The main guide to publishing data on the web is [1]. This work is progressively applying the principles which are enunciated there.

This conversion illustrates a few of the differences between local datasets,whether SQL or XML and a dataset designed to fit into a global database. Some decision remain unclear.

tables are implicitly within an organisational context.This context has to be added in RDF by creating a namespace for the local properties and identifiers
the scope of queries is implicitly within organisational boundaries, but in RDF this scope needs to be explicit. In the SQL query select * from emp; emp is ambiguously either the class of employees or the set of employees in the company. In RDF this needs to be explicit, so that two kind of tuples need to be added:
- tuples to type employees to a company definition of employee
- tuples to relate the employee to the company. (to be added)
linkage to the global database requires two kinds of links:
- local properties need to be mapped to global predicates. Here the employee name is mapped to foaf:surname (but the case probably needs changing). Alternatively a local predicete f:Location could be defined, which is equated to the foaf predicate with owl:savePropertyAs
- local identifiers of resources to be replaced by global uris. Here location is mapped to a dbpedia resource URI. Alternative the local UI f:/location/Dallas could be equated to the dbPedia resource with owl:sameAs.
foreign keys are replaced by full URIs.
primary keys are also replaced by URIs but the local primary key value, for example the employee number, will need to be retained as a literal if it is not purely a surrogate key. This perhaps should be mapped to rdf:label
datatypes are preferably explicit in the data to avoid conversion in queries although this increase the size of the rdf graph.
namespaces have been expanded in full where they occur in RDF attribute values. An alternative would be to define entities in an DTD prolog as shorthand for these namespaces but not all processors of the RDF would do the expansion?

[The choices made here are those of a novice and review would be welcome. ]

Some issues not yet addressed:

meta-data about the dataset as a whole - its origin, when and how converted, - these can be DC properties of a document, with each entity tied to that document as a part?
an alternative approach would be to start with a ontology and add mapping information to it rather than generating it from the ad-hoc configuration file.

Generic SPARQL Query

The following script provides, via a Joseki server at UWE, a query interface to this RDF. Literal language and datatype are ignored in this representation. URIs link to the browse query and also directly to the resource.

Two additional functions execute a SPARQL query and convert the SPARQL XML Query result to a table.

 declare function fr:execute-sparql($query as xs:string) {
  let $sparql := concat("http://dbpedia.org/sparql?format=xml&amp;default-graph-uri=http://dbpedia.org&amp;query=",escape-uri($query,true())  )
  return doc($sparql)
};

declare function fr:sparql-to-tuples($rdfxml ) {
   for $result in $rdfxml//r:result
   return
     <tuple>
            { for $binding  in $result/r:binding
               return                
                 if ($binding/r:uri)
                     then   element {$binding/@name}  {
                                    attribute type  {"uri"} , 
                                    string($binding/r:uri) 
                                }
                     else   element {$binding/@name}  {
                                    attribute type {$binding/@datatype}, 
                                    string($binding/r:literal)
                               }
             }
      </tuple>
 };

The SPARQL interface uses the configuration file to declare the namespaces.

import module namespace fr="http://www.cems.uwe.ac.uk/wiki/fr"   at  "fr.xqm";
declare namespace rdf = "http://www.w3.org/1999/02/22-rdf-syntax-ns#";
declare namespace rdfs = "http://www.w3.org/2000/01/rdf-schema#";

declare option exist:serialize "method=xhtml media-type=text/html omit-xml-declaration=no indent=yes 
        doctype-public=-//W3C//DTD&#160;XHTML&#160;1.0&#160;Transitional//EN
        doctype-system=http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd";
        
declare variable $config-file := request:get-parameter("config", "/db/Wiki/RDF/empdeptconfig.xml");
declare variable $config := doc($config-file);
declare variable $graph :=  concat("http://www.cems.uwe.ac.uk/xmlwiki/RDF/xml2rdf.xq?config=",$config-file);
declare variable $default-engine := "http://www.cems.uwe.ac.uk/joseki/sparql";
declare variable $script-name :=   tokenize(request:get-uri(),'/')[last()];
declare variable $default-prolog :=
"PREFIX fn: <http://www.w3.org/2005/xpath-functions#>
PREFIX afn: <http://jena.hpl.hp.com/ARQ/functions#>
";

declare variable $browse := "select ?s ?p ?o 
where {
      {<uri> ?p ?o }
      UNION
      {?s ?p <uri>}
      UNION
      {?s <uri> ?o}
   }";
   
let $config-prolog := fr:sparql-prefixes($config)
let $query := request:get-parameter ("query",())
let $uri := request:get-parameter("uri",())   
let $engine := request:get-parameter("engine",$default-engine)
let $query := 
   if ($uri)
   then  replace($browse,"uri",$uri)
   else $query
   
let $queryx := concat($default-prolog,$config-prolog,$query)
let $sparql := concat($engine,
                      "?default-graph-uri=",$graph,
                      "&amp;query=",encode-for-uri($queryx) 
                     )
let $result  :=
       if ($query !="")
       then
          fr:sparql-to-table(doc($sparql), $script-name) 
       else ()
return
<html xmlns="http://www.w3.org/1999/xhtml">
   <head> 
      <title>Emp-dept Query</title>
   </head>
   <body>
      <h1>Emp-dept Query</h1>
      <form action="{$script-name}">
          <textarea name="query" rows ="8" cols="90">
             {$query}
        </textarea>
        <br/>
        <input type="submit"/>
     </form>
     <h2>Result</h2>
         {$result}
    </body>
</html>

Query

The interface expands a query like

  select ?name ?job where {
   ?emp rdf:type  f:emp.
   ?emp foaf:surname ?name.
   ?emp f:Job ?job.
  }

into:

 prefix foaf: <http://xmlns.com/foaf/0.1/>
 prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
 prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#>
 prefix f: <http://www.cems.uwe.ac.uk/xmlwiki/empdept/concept/>
 prefix xs: <http://www.w3.org/2001/XMLSchema#>
 select ?name ?job 
 from <http://www.cems.uwe.ac.uk/xmlwiki/RDF/xml2rdf.xq?config=/db/Wiki/RDF/empdeptconfig.xml">
 where {
   ?emp rdf:type  f:emp.
   ?emp foaf:surname ?name.
   ?emp f:Job ?job.
 }

and sends this to the Joseki service. The graph to query is actually passed as the default graph rather than in the from clause.

Example Queries

This set of SPARQL queries needs further expansion and organisation and addition to the XQuery/SQL comparision.

Use these by copying and pasting into the query interface.

List all employees

 select ?emp where {
  ?emp  rdf:type  f:emp.
 }

List the names of all employees in alphabetical order

 select ?name where {
  ?emp  rdf:type  f:emp.
  ?emp  foaf:surname ?name. 
 }
 ORDER BY ?name

List the employees name, salary, department no and job

 select ?name ?sal ?dno ?job where {
  ?emp  rdf:type  f:emp;
        foaf:surname ?name;
        f:Sal ?sal;
        f:Dept ?dept;
        f:Job ?job.  
  ?dept f:DeptNo ?dno. 
 }

Note that ; in place of . repeats the subject.

List the first 5 employees

select ?ename where {
  ?emp  rdf:type  f:emp;
        foaf:surname ?ename.
} 
ORDER BY ?ename
LIMIT 5

List the top 5 employees by salary

select ?ename ?sal where {
 ?emp  rdf:type  f:emp;
       foaf:surname ?ename;
       f:Sal ?sal.
} 
ORDER BY DESC(?sal)
LIMIT 5

List the departments

select ?dept where {
  ?dept  rdf:type  f:dept.  
}

List all departments and all employees

select ?dept ?emp where {
  {?dept  rdf:type  f:dept }
  UNION
  {?emp rdf:type f:emp}
}

Count the number of departments

select count(?dept) where {
  ?dept  rdf:type  f:dept.

}

List the employees with salaries over 1000

If the RDF literal is typed, for example as xs:integer as is the case with this generated RDF, then the following query will select employees with a salary greater than 1000:

select ?emp ?sal where {
  ?emp  rdf:type  f:emp;
        f:Sal ?sal.
  FILTER (?sal > 1000)
}

If the RDF literal is not typed, then the variable must be cast:

 select ?emp ?sal where {
  ?emp  rdf:type  f:emp;
        f:Sal ?sal.
  FILTER (xs:integer(?sal) > 1000)
}

List employees and their locations

select ?emp ?loc where {
  ?emp  rdf:type  f:emp.
  ?emp f:Dept ?dept.
  ?dept f:Location ?loc.
}

List the names of employees and their managers

select ?ename ?mname where {
  ?emp  rdf:type  f:emp;
        f:Mgr ?mgr;
        foaf:surname ?ename.
  ?mgr foaf:surname ?mname.
}

Include employees with no manager:

select ?ename ?mname where {
  ?emp  rdf:type  f:emp;
        foaf:surname ?ename.
  OPTIONAL {?emp f:Mgr ?mgr.
            ?mgr foaf:surname ?mname.
           }
}

List employees with no manager:
 
select ?ename where {
  ?emp  rdf:type  f:emp;
        foaf:surname ?ename.
  OPTIONAL {?emp f:Mgr ?mgr}
  FILTER (!bound(?mgr))
}

List the distinct locations of staff

select distinct ?loc  where {
  ?emp  rdf:type  f:emp.
  ?emp  f:Dept ?dept.
  ?dept f:Location ?loc.
}

List details of the employees who are ANALYSTs

select * where {
  ?emp  rdf:type  f:emp.
  ?emp  f:Dept ?dept.
  ?dept f:Location ?loc.
  ?emp f:Job ?job.
  FILTER (?job = "ANALYST")
}

List employees who are either ANALYSTs or MANAGERs

select ?emp where {
  ?emp  rdf:type  f:emp;
        f:Job ?job.
  FILTER (?job = "ANALYST"  || ?job = "MANAGER")
}

List employees who are neither ANALYSTs nor MANAGERs

select * where {
   ?emp  rdf:type  f:emp;
         f:Job ?job.
   FILTER (?job != "ANALYST"  && ?job != "MANAGER")
}

List employees whose surname begins with "S"

select * where {
  ?emp  rdf:type  f:emp.
  ?emp foaf:surname ?ename.
  FILTER (regex(?ename, "^S"))
}

List employees whose surname contains "AR"

select * where {
  ?emp  rdf:type  f:emp.
  ?emp foaf:surname ?ename.
  FILTER (regex(?ename, "AR"))
}

List employees whose surname contains M followed by R ignoring case

select * where {
  ?emp  rdf:type  f:emp.
  ?emp foaf:surname ?ename.
  FILTER (regex(?ename, "m.*r","i"))
}

Compute the maximum salary

SPARQL lacks min() or max(), although they are added to some implementations, the following recipe, due to Dean Allemang can be used:

select ?maxemp ?maxsal where {
 ?maxemp  rdf:type  f:emp.
 ?maxemp  f:Sal ?maxsal.
 OPTIONAL { ?emp rdf:type f:emp.
            ?emp f:Sal ?sal.
            FILTER ( ?sal > ?maxsal)
          }.
 FILTER (!bound (?sal))
}

How does this work? We seek a maximum salary of a maximum employee.

Get the department which SMITH works for

select ?dname  where {
  ?emp  rdf:type  f:emp.
  ?emp f:Dept ?dept.
  ?emp foaf:surname "SMITH".
  ?dept f:Dname ?dname.
}

List the names of employees in Accounting

select ?ename  where {
  ?emp  rdf:type  f:emp.
  ?emp f:Dept ?dept.
  ?emp foaf:surname ?ename.
  ?dept f:Dname "Accounting".
}

List all employee data

select ?prop ?val where {
  ?emp  rdf:type  f:emp.
  ?emp ?prop ?val.
}

List all data

select * where {
 ?s ?p ?o
}

Employees hired in this millennium

select ?ename ?hire where {
 ?emp  rdf:type  f:emp.
 ?emp f:HireDate ?hire.
 ?emp foaf:surname ?ename.
 FILTER (?hire > "2000-01-01"^^xs:date) 
}

Note that the literal needs to be typed to make this comparison work.

List the names of employees whose manager is in a different department

select ?name ?edname ?mdname {
  ?emp  rdf:type  f:emp;
        foaf:surname ?name;
        f:Dept ?dept;
        f:Mgr ?mgr.
  
   ?mgr f:Dept ?mdept. 
   ?dept f:Dname ?edname.
   ?mdept f:Dname ?mdname.
   FILTER (?dept != ?mdept)
}

List the grades of employees

In relational terms, this is a theta-join between the employee and the salgrade tables:

select ?ename ?grade where {
 ?emp  rdf:type  f:emp;
       foaf:surname ?ename;
       f:Sal ?sal.
 ?salgrade rdf:type f:salgrade; 
       f:LoSal ?low; 
       f:HiSal ?high;
       f:Grade ?grade.
   
 FILTER (?sal >= ?low && ?sal <= ?high)
 }

Abbreviated query forms

A new prefix simplifies referencing individual resources by their URI

 prefix e: <http://www.cems.uwe.ac.uk/xmlwiki/empdept/emp/>
 select ?sal  where {
    e:7900 f:Sal ?sal.
 }

is short for

 select ?sal  where {
    <http://www.cems.uwe.ac.uk/xmlwiki/empdept/emp/7900> f:Sal ?sal.
 }

We could also introduce a default namespace:

prefix : <http://www.cems.uwe.ac.uk/empdept/concept/>
select ?name ?sal ?dno ?job where {
 ?emp  rdf:type  :emp;
       foaf:surname ?name;
       :Sal ?sal;
       :Dept ?dept;
       :Job ?job.  
 ?dept :DeptNo ?dno. 
}

and use the abbreviation a for rdf:type:

prefix : <http://www.cems.uwe.ac.uk/empdept/concept/>
select ?name ?sal ?dno ?job where {
 ?emp  a :emp;
       foaf:surname ?name;
       :Sal ?sal;
       :Dept ?dept;
       :Job ?job.  
 ?dept :DeptNo ?dno. 
}

and if we don't need to return the resource itself, it can be anonymous

prefix : <http://www.cems.uwe.ac.uk/empdept/concept/>
select ?name ?sal ?dno ?job where {
 [ a :emp;
   foaf:surname ?name;
   :Sal ?sal;
   :Dept ?dept;
   :Job ?job
 ].
 ?dept :DeptNo ?dno. 
}

Schema queries

The presence of schema data in the same RDF graph enable schema queries about resources to be made.

List all predicates for an employee

select distinct ?prop where {
  ?emp  rdf:type  f:emp.
  ?emp ?prop ?val.
}

What integer properties do employees have?

select ?prop  where {
   ?emp a f:emp. 
   ?prop rdfs:domain ?type.
   ?prop rdfs:range xs:integer.
}

What literal properties do MANAGERS have?

select DISTINCT ?prop  where {
   ?x f:Job "MANAGER".
   ?x a ?type. 
   ?prop rdfs:domain ?type.
   ?prop rdfs:range rdfs:literal.
}

What types of resources have salaries?

select ?type where {
  f:Sal rdfs:domain ?type.
}