<?xml version="1.0" encoding="UTF-8" ?>
<!--RDF-based XML document generated by OpenLink Virtuoso-->
<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
 <rss:channel xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/dav/dav-blog-1/">
  <rss:title>OpenLink Community Blog</rss:title>
  <rss:link>http://www.openlinksw.com/weblog/dav/dav-blog-1/</rss:link>
  <rss:description>A Collection of blogs by OpenLink Staff</rss:description>
  <dc:creator xmlns:dc="http://purl.org/dc/elements/1.1/">kidehen@openlinksw.com</dc:creator>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-11-23T11:44:05Z</dc:date>
  <rss:items>
   <rdf:Seq>
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-11-18#1590" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2009-09-01#1573" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2009-09-01#1572" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-07-23#1565" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-04-22#1542" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2009-04-01#1541" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2009-04-01#1540" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2009-03-25#1538" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2009-03-25#1537" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-03-14#1531" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-01-27#1520" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-01-24#1519" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-01-09#1517" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-12-16#1499" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-12-16#1498" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-12-11#1495" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-12-11#1494" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-11-28#1489" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-11-27#1488" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-11-27#1487" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-10-24#1460" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-10-24#1459" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-10-01#1447" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-09-30#1446" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-09-30#1445" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-09-11#1437" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-08-15#1413" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-06-09#1381" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-06-09#1376" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-05-20#1364" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-05-16#1362" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2008-05-09#1359" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2008-05-09#1358" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-30#1352" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-15#1341" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-09#1333" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-09#1332" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-03-12#1323" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-02-08#1314" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-31#1306" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-17#1300" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-15#1295" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-04#1288" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2007-11-21#1272" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2007-11-08#1269" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-09-22#1261" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-09-03#1249" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-06-14#1224" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-05-25#1203" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-05-25#1202" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-03-22#1165" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-03-09#1157" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-03-01#1148" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-02-24#1143" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-01-22#1123" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-10-18#1064" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-09-07#1036" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-08-28#1030" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/vdb/blog/?date=2006-08-10#1025" />
      <rdf:li rdf:resource="http://www.openlinksw.com/weblog/oerling/?date=2006-08-10#1024" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-07-04#995" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-05-11#973" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-04-05#947" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-11-14#902" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-10-26#882" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-04-26#810" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-03-26#766" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2004-08-26#611" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-10-02#383" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-09-25#373" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-22#245" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-22#244" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-21#241" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-05#231" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-05-31#347" />
      <rdf:li rdf:resource="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-05-31#78" />
   </rdf:Seq>
  </rss:items>
 </rss:channel>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-11-18#1590">
  <rss:title>5 Game Changing Things about the OpenLink Virtuoso + AWS Cloud Combo</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-11-18T19:12:56Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Here are 5 powerful benefits you can immediately derive from the combination of Virtuoso and Amazon&#39;s AWS services (specifically the EC2 and EBS components): Acquire your own personal or service specific data space in the Cloud. Think DBase, Paradox, FoxPRO, Access of yore, but with the power of Oracle, Informix, Microsoft SQL Server etc., using a Conceptual, as opposed to solely Logical, model based DBMS (i.e., a Hybrid DBMS Engine for: SQL, RDF, XML, and Full Text) Ability to share and control access to your resources using innovations like FOAF+SSL, OpenID, and OAuth, all from one place Construction of personal or organization based FOAF profiles in a matter of minutes; by simply creating a basic DBMS (or ODS application layer) account; and then using this profile to create strong links (references) to all your Data silos (esp. those from the Web 2.0 realm) Load data sets from the LOD cloud or Sponge existing Web resources (i.e., on the fly data transformation to RDF model based Linked Data) and then use the combination to build powerful lookup services that enrich the value of URLs (think: Web addressable reports holding query results) that you publish Bind all of the above to a domain that you own (e.g. a .Name domain) so that you have an attribution-friendly &quot;authority&quot; component for resource URLs and Entity URIs published from your Personal Linked Data Space on the Web (or private HTTP network). In a nutshell, the AWS Cloud infrastructure simplifies the process of generating Federated presence on the Internet and/or World Wide Web. Remember, centralized networking models always end up creating data silos, in some context, ultimately! :-)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
Here are 5 powerful benefits you can immediately derive from the combination of <a href="http://virtuoso.openlinksw.com" id="link-id17eb8988">Virtuoso</a> and Amazon&#39;s AWS services (specifically the EC2 and EBS components):</p>

<ol>
<li>
Acquire your own personal or service specific <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id1423e520">data space</a> in the Cloud. Think DBase, Paradox, FoxPRO, Access of yore, but with the power of <a href="http://dbpedia.org/resource/Oracle_Database" id="link-id136c6290">Oracle</a>, <a href="http://dbpedia.org/resource/IBM_Informix" id="link-id11b269b8">Informix</a>, <a href="http://dbpedia.org/resource/Microsoft_SQL_Server" id="link-id138084b8">Microsoft SQL Server</a> etc., using a Conceptual, as opposed to solely Logical, model based DBMS (i.e., a Hybrid DBMS Engine for: <a href="http://dbpedia.org/resource/SQL" id="link-id132a7938">SQL</a>, RDF, XML, and Full Text)
</li>
<li>
Ability to share and control access to your resources using innovations like <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id17ee9d28">FOAF</a>+SSL, OpenID, and OAuth, all from one place
</li>
<li>
Construction of personal or organization based FOAF profiles in a matter of minutes; by simply creating a basic DBMS (or <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id14784ae0">ODS</a> application layer) account; and then using this profile to create strong links (references) to all your Data silos (esp. those from the <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> 2.0 realm)
</li>
<li>
Load data sets from the <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id17e6ac98">LOD</a> cloud or Sponge existing Web resources (i.e., on the fly data transformation to RDF model based <a href="http://dbpedia.org/resource/Linked_Data" id="link-id17e65d38">Linked Data</a>) and then use the combination to build powerful lookup services that enrich the value of URLs (think: Web addressable reports holding query results) that you publish
</li>
<li>
Bind all of the above to a domain that you own (e.g. a .Name domain) so that you have an attribution-friendly &quot;authority&quot; component for resource URLs and <a href="http://dbpedia.org/resource/Entity" id="link-id118a08d8">Entity</a> URIs published from your Personal Linked Data Space on the Web (or private HTTP network).
</li>
</ol>
<p>
In a nutshell, the AWS Cloud infrastructure simplifies the process of generating Federated presence on the <a href="http://dbpedia.org/resource/Internet" id="link-id1380af38">Internet</a> and/or <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id11633b10">World Wide Web</a>. Remember, centralized networking models always end up creating data silos, in some <a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id142006f0">context</a>, ultimately! :-)
</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2009-09-01#1573">
  <rss:title>Provenance and Reification in Virtuoso</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-09-01T14:44:08Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">These days, data provenance is a big topic across the board, ranging from the linked data web, to RDF in general, to any kind of data integration, with or without RDF. Especially with scientific data we encounter the need for metadata and provenance, repeatability of experiments, etc. Data without context is worthless, yet the producers of said data do not always have a model or budget for metadata. And if they do, the approach is often a proprietary relational schema with web services in front. RDF and linked data principles could evidently be a great help. This is a large topic that goes into the culture of doing science and will deserve a more extensive treatment down the road. For now, I will talk about possible ways of dealing with provenance annotations in Virtuoso at a fairly technical level. If data comes many-triples-at-a-time from some source (e.g., library catalogue, user of a social network), then it is often easiest to put the data from each source/user into its own graph. Annotations can then be made on the graph. The graph IRI will simply occur as the subject of a triple in the same or some other graph. For example, all such annotations could go into a special annotations graph. On the query side, having lots of distinct graphs does not have to be a problem if the index scheme is the right one, i.e., the 4 index scheme discussed in the Virtuoso documentation. If the query does not specify a graph, then triples in any graph will be considered when evaluating the query. One could write queries like — SELECT ?pub WHERE { GRAPH ?g { ?person foaf:knows ?contact } ?contact foaf:name &quot;Alice&quot; . ?g xx:has_publisher ?pub } This would return the publishers of graphs that assert that somebody knows Alice. Of course, the RDF reification vocabulary can be used as-is to say things about single triples. It is however very inefficient and is not supported by any specific optimization. 
Further, reification does not seem to get used very much; thus there is no great pressure to specially optimize it. If we have to say things about specific triples and this occurs frequently (i.e., for more than 10% or so of the triples), then modifying the quad table becomes an option. For all its inefficiency, the RDF reification vocabulary is applicable if reification is a rarity. Virtuoso&#39;s RDF_QUAD table can be altered to have more columns. The problem with this is that space usage is increased and the RDF loading and query functions will not know about the columns. A SQL update statement can be used to set values for these additional columns if one knows the G,S,P,O. Suppose we annotated each quad with the user who inserted it and a timestamp. These would be columns in the RDF_QUAD table. The next choice would be whether these were primary key parts or dependent parts. If primary key parts, these would be non-NULL and would occur on every index. The same quad would exist for each distinct user and time this quad had been inserted. For loading functions to work, these columns would need a default. In practice, we think that having such metadata as a dependent part is more likely, so that G,S,P,O are the unique identifier of the quad. Whether one would then include these columns on indices other than the primary key would depend on how frequently they were accessed. In SPARQL, one could use an extension syntax like — SELECT * WHERE { ?person foaf:knows ?connection OPTION ( time ?ts ) . ?connection foaf:name &quot;Alice&quot; . FILTER ( ?ts &gt; &quot;2009-08-08&quot;^^xsd:datetime ) } This would return everybody who knows Alice since a date more recent than 2009-08-08. This presupposes that the quad table has been extended with a datetime column. The OPTION (time ?ts) syntax is not presently supported but we can easily add something of the sort if there is user demand for it. 
In practice, this would be an extension mechanism enabling one to access extension columns of RDF_QUAD via a column ?variable syntax in the OPTION clause. If quad metadata were not for every quad but still relatively frequent, another possibility would be making a separate table with a key of GSPO and a dependent part of R, where R would be the reification URI of the quad. Reification statements would then be made with R as a subject. This would be more compact than the reification vocabulary and would not modify the RDF_QUAD table. The syntax for referring to this could be something like — SELECT * WHERE { ?person foaf:knows ?contact OPTION ( reify ?r ) . ?r xx:assertion_time ?ts . ?contact foaf:name &quot;Alice&quot; . FILTER ( ?ts &gt; &quot;2008-8-8&quot;^^xsd:datetime ) } We could even recognize the reification vocabulary and convert it into the reify option if this were really necessary. But since it is so unwieldy I don&#39;t think there would be huge demand. Who knows? You tell us.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>These days, <a href="http://dbpedia.org/resource/Data" id="link-id0x37019c8">data</a> provenance is a big topic across the board, ranging from the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x53c3620">linked data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0x4aa3848">web</a>, to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x385aff0">RDF</a> in general, to any kind of data integration, with or without RDF.  Especially with scientific data we encounter the need for metadata and provenance, repeatability of experiments, etc.  Data without context is worthless, yet the producers of said data do not always have a model or budget for metadata.  And if they do, the approach is often a proprietary relational schema with web services in front.</p>

<p>RDF and linked data principles could evidently be a great help.  This is a large topic that goes into the culture of doing science and will deserve a more extensive treatment down the road.</p>

<p>For now, I will talk about possible ways of dealing with provenance annotations in <a href="http://virtuoso.openlinksw.com" id="link-id0x51c4da0">Virtuoso</a> at a fairly technical level.</p>

<p>If data comes many-triples-at-a-time from some source (e.g., library catalogue, user of a social network), then it is often easiest to put the data from each source/user into its own graph.  Annotations can then be made on the graph.  The graph IRI will simply occur as the subject of a triple in the same or some other graph.  For example, all such annotations could go into a special annotations graph.</p>

<p>On the query side, having lots of distinct graphs does not have to be a problem if the index scheme is the right one, i.e., the 4 index scheme <a href="http://docs.openlinksw.com/virtuoso/rdfperformancetuning.html#rdfperfindexes" id="link-id142a0798">discussed in the Virtuoso documentation</a>.  If the query does not specify a graph, then triples in any graph will be considered when evaluating the query.</p>


<p>One could write queries like —</p>

<blockquote>
 <pre><code>SELECT  ?pub 
  WHERE 
    { 
      GRAPH  ?g 
        { 
          ?person  foaf:knows  ?contact 
        } 
      ?contact  foaf:name         &quot;Alice&quot;  . 
      ?g        xx:has_publisher  ?pub 
    }</code></pre>
</blockquote>

<p>This would return the publishers of graphs that assert that somebody knows Alice.</p>

<p>Of course, the <a href="http://www.w3.org/TR/2004/REC-rdf-primer-20040210/#reification" id="link-id14fa9488">RDF reification vocabulary</a> can be used as-is to say things about single triples.  It is however very inefficient and is not supported by any specific optimization.  Further, reification does not seem to get used very much; thus there is no great pressure to specially optimize it.</p>

<p>If we have to say things about specific triples and this occurs frequently (i.e., for more than 10% or so of the triples), then modifying the quad table becomes an option. For all its inefficiency, the RDF reification vocabulary is applicable if reification is a rarity.</p>

<p>Virtuoso&#39;s <code>RDF_QUAD</code> table can be altered to have more columns.  The problem with this is that space usage is increased and the RDF loading and query functions will not know about the columns.  A <a href="http://dbpedia.org/resource/SQL" id="link-id0x4784bf0">SQL</a> update statement can be used to set values for these additional columns if one knows the <code>G,S,P,O</code>. </p>

<p>Suppose we annotated each quad with the user who inserted it and a timestamp.  These would be columns in the <code>RDF_QUAD</code> table.  The next choice would be whether these were primary key parts or dependent parts.  If primary key parts, these would be non-<code>NULL</code> and would occur on every index.  The same quad would exist for each distinct user and time this quad had been inserted.  For loading functions to work, these columns would need a default.  In practice, we think that having such metadata as a dependent part is more likely, so that <code>G,S,P,O</code> are the unique identifier of the quad.  Whether one would then include these columns on indices other than the primary key would depend on how frequently they were accessed.</p>

<p>In <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x4a8a7c0">SPARQL</a>, one could use an extension syntax like —</p>

<blockquote>
 <pre><code>SELECT  * 
  WHERE 
    { ?person      foaf:knows  ?connection 
                   OPTION ( time  ?ts )     . 
      ?connection  foaf:name   &quot;Alice&quot;      . 
      FILTER ( ?ts &gt; &quot;2009-08-08&quot;^^xsd:datetime ) 
    }</code></pre>
</blockquote>

<p>This would return everybody who knows Alice since a date more recent than 2009-08-08.  This presupposes that the quad table has been extended with a datetime column.</p>

<p>The <code>OPTION (time ?ts)</code> syntax is not presently supported but we can easily add something of the sort if there is user demand for it. In practice, this would be an extension mechanism enabling one to access extension columns of <code>RDF_QUAD</code> via a column <code>?variable</code> syntax in the <code>OPTION</code> clause.</p>


<p>If quad metadata were not for every quad but still relatively frequent, another possibility would be making a separate table with a key of <code>GSPO</code> and a dependent part of <code>R</code>, where <code>R</code> would be the reification <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id0x49e6108">URI</a> of the quad.  Reification statements would then be made with <code>R</code> as a subject.  This would be more compact than the reification vocabulary and would not modify the <code>RDF_QUAD</code> table.   The syntax for referring to this could be something like —</p>

<blockquote>
 <pre><code>SELECT * 
  WHERE 
    { ?person   foaf:knows         ?contact 
                OPTION ( reify  ?r )          . 
      ?r        xx:assertion_time  ?ts       . 
      ?contact  foaf:name          &quot;Alice&quot;   . 
      FILTER ( ?ts &gt; &quot;2008-8-8&quot;^^xsd:datetime ) 
    }</code></pre>
</blockquote>

<p>We could even recognize the reification vocabulary and convert it into the reify option if this were really necessary.  But since it is so unwieldy I don&#39;t think there would be huge demand.  Who knows?  You tell us.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2009-09-01#1572">
  <rss:title>Provenance and Reification in Virtuoso</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-09-01T14:44:08Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">These days, data provenance is a big topic across the board, ranging from the linked data web, to RDF in general, to any kind of data integration, with or without RDF. Especially with scientific data we encounter the need for metadata and provenance, repeatability of experiments, etc. Data without context is worthless, yet the producers of said data do not always have a model or budget for metadata. And if they do, the approach is often a proprietary relational schema with web services in front. RDF and linked data principles could evidently be a great help. This is a large topic that goes into the culture of doing science and will deserve a more extensive treatment down the road. For now, I will talk about possible ways of dealing with provenance annotations in Virtuoso at a fairly technical level. If data comes many-triples-at-a-time from some source (e.g., library catalogue, user of a social network), then it is often easiest to put the data from each source/user into its own graph. Annotations can then be made on the graph. The graph IRI will simply occur as the subject of a triple in the same or some other graph. For example, all such annotations could go into a special annotations graph. On the query side, having lots of distinct graphs does not have to be a problem if the index scheme is the right one, i.e., the 4 index scheme discussed in the Virtuoso documentation. If the query does not specify a graph, then triples in any graph will be considered when evaluating the query. One could write queries like — SELECT ?pub WHERE { GRAPH ?g { ?person foaf:knows ?contact } ?contact foaf:name &quot;Alice&quot; . ?g xx:has_publisher ?pub } This would return the publishers of graphs that assert that somebody knows Alice. Of course, the RDF reification vocabulary can be used as-is to say things about single triples. It is however very inefficient and is not supported by any specific optimization. 
Further, reification does not seem to get used very much; thus there is no great pressure to specially optimize it. If we have to say things about specific triples and this occurs frequently (i.e., for more than 10% or so of the triples), then modifying the quad table becomes an option. For all its inefficiency, the RDF reification vocabulary is applicable if reification is a rarity. Virtuoso&#39;s RDF_QUAD table can be altered to have more columns. The problem with this is that space usage is increased and the RDF loading and query functions will not know about the columns. A SQL update statement can be used to set values for these additional columns if one knows the G,S,P,O. Suppose we annotated each quad with the user who inserted it and a timestamp. These would be columns in the RDF_QUAD table. The next choice would be whether these were primary key parts or dependent parts. If primary key parts, these would be non-NULL and would occur on every index. The same quad would exist for each distinct user and time this quad had been inserted. For loading functions to work, these columns would need a default. In practice, we think that having such metadata as a dependent part is more likely, so that G,S,P,O are the unique identifier of the quad. Whether one would then include these columns on indices other than the primary key would depend on how frequently they were accessed. In SPARQL, one could use an extension syntax like — SELECT * WHERE { ?person foaf:knows ?connection OPTION ( time ?ts ) . ?connection foaf:name &quot;Alice&quot; . FILTER ( ?ts &gt; &quot;2009-08-08&quot;^^xsd:datetime ) } This would return everybody who knows Alice since a date more recent than 2009-08-08. This presupposes that the quad table has been extended with a datetime column. The OPTION (time ?ts) syntax is not presently supported but we can easily add something of the sort if there is user demand for it. 
In practice, this would be an extension mechanism enabling one to access extension columns of RDF_QUAD via a column ?variable syntax in the OPTION clause. If quad metadata were not for every quad but still relatively frequent, another possibility would be making a separate table with a key of GSPO and a dependent part of R, where R would be the reification URI of the quad. Reification statements would then be made with R as a subject. This would be more compact than the reification vocabulary and would not modify the RDF_QUAD table. The syntax for referring to this could be something like — SELECT * WHERE { ?person foaf:knows ?contact OPTION ( reify ?r ) . ?r xx:assertion_time ?ts . ?contact foaf:name &quot;Alice&quot; . FILTER ( ?ts &gt; &quot;2008-8-8&quot;^^xsd:datetime ) } We could even recognize the reification vocabulary and convert it into the reify option if this were really necessary. But since it is so unwieldy I don&#39;t think there would be huge demand. Who knows? You tell us.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>These days, <a href="http://dbpedia.org/resource/Data" id="link-id0x4a44870">data</a> provenance is a big topic across the board, ranging from the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x4e10e60">linked data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0x4738350">web</a>, to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x1fe33310">RDF</a> in general, to any kind of data integration, with or without RDF.  Especially with scientific data we encounter the need for metadata and provenance, repeatability of experiments, etc.  Data without context is worthless, yet the producers of said data do not always have a model or budget for metadata.  And if they do, the approach is often a proprietary relational schema with web services in front.</p>

<p>RDF and linked data principles could evidently be a great help.  This is a large topic that goes into the culture of doing science and will deserve a more extensive treatment down the road.</p>

<p>For now, I will talk about possible ways of dealing with provenance annotations in <a href="http://virtuoso.openlinksw.com" id="link-id0x36581e8">Virtuoso</a> at a fairly technical level.</p>

<p>If data comes many-triples-at-a-time from some source (e.g., library catalogue, user of a social network), then it is often easiest to put the data from each source/user into its own graph.  Annotations can then be made on the graph.  The graph IRI will simply occur as the subject of a triple in the same or some other graph.  For example, all such annotations could go into a special annotations graph.</p>

<p>On the query side, having lots of distinct graphs does not have to be a problem if the index scheme is the right one, i.e., the 4 index scheme <a href="http://docs.openlinksw.com/virtuoso/rdfperformancetuning.html#rdfperfindexes" id="link-id142a0798">discussed in the Virtuoso documentation</a>.  If the query does not specify a graph, then triples in any graph will be considered when evaluating the query.</p>


<p>One could write queries like —</p>

<blockquote>
 <code><pre>SELECT  ?pub 
  WHERE 
    { 
      GRAPH  ?g 
        { 
          ?person  foaf:knows  ?contact 
        } 
      ?contact  foaf:name         &quot;Alice&quot;  . 
      ?g        xx:has_publisher  ?pub 
    }</pre>
 </code>
</blockquote>

<p>This would return the publishers of graphs that assert that somebody knows Alice.</p>

<p>Of course, the <a href="http://www.w3.org/TR/2004/REC-rdf-primer-20040210/#reification" id="link-id14fa9488">RDF reification vocabulary</a> can be used as-is to say things about single triples.  It is however very inefficient and is not supported by any specific optimization.  Further, reification does not seem to get used very much; thus there is no great pressure to specially optimize it.</p>

<p>If we have to say things about specific triples and this occurs frequently (i.e., for more than 10% or so of the triples), then modifying the quad table becomes an option. For all its inefficiency, the RDF reification vocabulary is applicable if reification is a rarity.</p>

<p>Virtuoso&#39;s <code>RDF_QUAD</code> table can be altered to have more columns.  The problem with this is that space usage is increased and the RDF loading and query functions will not know about the columns.  A <a href="http://dbpedia.org/resource/SQL" id="link-id0x4b1d938">SQL</a> update statement can be used to set values for these additional columns if one knows the <code>G,S,P,O</code>. </p>

<p>Suppose we annotated each quad with the user who inserted it and a timestamp.  These would be columns in the <code>RDF_QUAD</code> table.  The next choice would be whether these were primary key parts or dependent parts.  If primary key parts, these would be non-<code>NULL</code> and would occur on every index.  The same quad would exist for each distinct user and time this quad had been inserted.  For loading functions to work, these columns would need a default.  In practice, we think that having such metadata as a dependent part is more likely, so that <code>G,S,P,O</code> are the unique identifier of the quad.  Whether one would then include these columns on indices other than the primary key would depend on how frequently they were accessed.</p>

<p>In <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x472afb0">SPARQL</a>, one could use an extension syntax like —</p>

<blockquote>
 <code><pre>SELECT  * 
  WHERE 
    { ?person      foaf:knows  ?connection 
                   OPTION ( time  ?ts )     . 
      ?connection  foaf:name   &quot;Alice&quot;      . 
      FILTER ( ?ts &gt; &quot;2009-08-08&quot;^^xsd:datetime ) 
    }</pre>
 </code>
</blockquote>

<p>This would return everybody who knows Alice since a date more recent than 2009-08-08.  This presupposes that the quad table has been extended with a datetime column.</p>

<p>The <code>OPTION (time ?ts)</code> syntax is not presently supported but we can easily add something of the sort if there is user demand for it. In practice, this would be an extension mechanism enabling one to access extension columns of <code>RDF_QUAD</code> via a column <code>?variable</code> syntax in the <code>OPTION</code> clause.</p>


<p>If quad metadata were not for every quad but still relatively frequent, another possibility would be making a separate table with a key of <code>GSPO</code> and a dependent part of <code>R</code>, where <code>R</code> would be the reification <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id0x365b190">URI</a> of the quad.  Reification statements would then be made with <code>R</code> as a subject.  This would be more compact than the reification vocabulary and would not modify the <code>RDF_QUAD</code> table.   The syntax for referring to this could be something like —</p>

<blockquote>
 <code><pre>SELECT * 
  WHERE 
    { ?person   foaf:knows         ?contact 
                OPTION ( reify  ?r )          . 
      ?r        xx:assertion_time  ?ts       . 
      ?contact  foaf:name          &quot;Alice&quot;   . 
      FILTER ( ?ts &gt; &quot;2008-8-8&quot;^^xsd:datetime ) 
    }</pre>
 </code>
</blockquote>

<p>We could even recognize the reification vocabulary and convert it into the reify option if this were really necessary.  But since it is so unwieldy I don&#39;t think there would be huge demand.  Who knows?  You tell us.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-07-23#1565">
  <rss:title>Exploring the Value Proposition of Linked Data</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-07-24T00:17:19Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">What is Linked Data? The primary topic of a meme penned by TimBL in the form of a Design Issues Doc (note: this is how TimBL has shared his thoughts since the Beginning of the Web). There are a number of dimensions to the meme, but its primary purpose is the reintroduction of the HTTP URI -- a vital component of the Web&#39;s core architecture. What&#39;s Special about HTTP URIs? They possess an intrinsic duality that combines persistent and unambiguous Data Identity with platform &amp; representation format independent Data Access. Thus, you can use a string of characters that look like a contemporary Web URL to unambiguously achieve the following: Identity or Name Anything of Interest Describe Anything of Interest by associating the Description Subject&#39;s Identity with a constellation of Attribute and Value pairs (technically: an Entity-Attribute-Value or Subject-Predicate-Object graph) Make the Description of Named Things of Interest discoverable on the Web by implicitly binding the aforementioned to Documents that hold their descriptions (technically: metadata documents or information resources) What&#39;s the basic value proposition of the Linked Data meme? Enabling more productive use of the Web by users and developers alike. All of which is achieved by tweaking the Web&#39;s Hyperlinking feature such that it now includes Hypertext and Hyperdata as link types. Note: Hyperdata Linking is simply what an HTTP URI facilitates. 
Examples problems solved by injecting Linked Data into the Web: Federated Identity by enabling Individuals to unambiguously Identify themselves (Profiles++) courtesy of existing Internet and Web protocols (e.g., FOAF+SSL&#39;s WebIDs which combine Personal Identity with X.509 certificates and HTTPs based client side certification) Security and Privacy challenge alleviation by delivering a mechanism for policy based data access that feeds off federated individual identity and social network (graph) traversal Spam Busting via the above. Increasing the Serendipitous Discovery Quotient (SDQ) of Web accessible resources by embedding Rich Metadata into (X)HTML Documents e.g., structured descriptions of your &quot;WishLists&quot; and &quot;OfferLists&quot; via a common set of terms offered by vocabularies such as GoodRelations and SIOC Coherent integration of disparate data across the Web and/or within the Enterprise via &quot;Data Meshing&quot; rather than &quot;Data Mashing&quot; Moving beyond imprecise statistically driven &quot;Keyword Search&quot; (e.g. Page Rank) to &quot;Precision Find&quot; driven by typed link based Entity Rank plus Entity Type and Entity Property filters. Conclusion If all of the above still falls into the technical mumbo-jumbo realm, then simply consider Linked Data as delivering Open Data Access in granular form to Web accessible data -- that goes beyond data containers (documents or files). The value proposition of Linked Data is inextricably linked to the value proposition of the World Wide Web. This is true, because the Linked Data meme is ultimately about an enhancement of the current Web; achieved by reintroducing its architectural essence -- in new context -- via a new level of link abstraction, courtesy of the Identity and Access duality of HTTP URIs. 
As a result of Linked Data, you can now have Links on the Web for a Person, Document, Music, Consumer Electronics, Products &amp; Services, Business Opening &amp; Closing Hours, Personal &quot;WishLists&quot; and &quot;OfferList&quot;, an Idea, etc.. in addition to links for Properties (Attributes &amp; Values) of the aforementioned. Ultimately, all of these links will be indexed in a myriad of ways providing the substrate for the next major period of Internet &amp; Web driven innovation, within our larger human-ingenuity driven innovation continuum. Related Recipes for Describing Your Business and its Offerings using the GoodRelations Vocabulary / Schema Solving Real Problems with RDF based Linked Data Other Linked Data Posts from this Blog oriented Linked Data Space (goes back a few years!) Various practical Linked Data demo links from my Del.icio.us Bookmark oriented Data Space My personal WebID which is conduit to a Linked Data mesh covering vast variety of things I&#39;ve opted to share with others via the Web (best viewed using a Linked Data aware User Agent like ODE).</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<h3>What is <a href="http://dbpedia.org/resource/Linked_Data" id="link-id138c9aa8">Linked Data</a>?</h3>
<p>
The primary topic of a <a href="http://dbpedia.org/resource/Meme" id="link-id12f86100">meme</a> penned by <a href="http://www.w3.org/People/Berners-Lee/card#i" id="link-id115b4c98">TimBL</a> in the form of a <a href="http://www.w3.org/DesignIssues/LinkedData.html" id="link-id1333f300">Design Issues Doc</a> (note: this is how TimBL has shared his thoughts since the <a href="http://www.w3.org/DesignIssues/" id="link-id1128a1d0">Beginning of the Web</a>).
</p>
<p>
There are a number of dimensions to the meme, but its primary purpose is the reintroduction of the HTTP <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id13c43cb8">URI</a> -- a vital component of the Web&#39;s core architecture. 
</p>
<h3>
What&#39;s Special about HTTP URIs?</h3>
<p>
They possess an intrinsic duality that combines persistent and unambiguous <a href="http://dbpedia.org/resource/Data">Data</a> Identity with platform &amp; representation format independent Data Access. Thus, you can use a string of characters that look like a contemporary Web <a href="http://dbpedia.org/resource/Uniform_Resource_Locator" id="link-id119cd8a0">URL</a> to unambiguously achieve the following:
</p>
<ol>
<li>Identify or Name Anything of Interest</li>
<li>Describe Anything of Interest by associating the Description Subject&#39;s Identity with a constellation of Attribute and Value pairs (technically: an <a href="http://dbpedia.org/resource/Entity-attribute-value_model" id="link-id1133e8a8">Entity</a>-Attribute-Value or Subject-Predicate-Object graph)</li>
<li>Make the Description of Named Things of Interest discoverable on the Web by implicitly binding the aforementioned to Documents that hold their descriptions (technically: metadata documents or <a href="http://dbpedia.org/resource/Information" id="link-id1391da40">information</a> resources)</li> 
</ol>
<h3>What&#39;s the basic value proposition of the <a href="http://www.w3.org/DesignIssues/LinkedData.html" id="link-id113bb690">Linked Data meme</a>?</h3>
<p>Enabling more productive use of the Web by users and developers alike. All of which is achieved by tweaking the Web&#39;s Hyperlinking feature such that it now includes Hypertext and <a href="http://dbpedia.org/resource/Linked_Data" id="link-id1337a3f0">Hyperdata</a> as link types.</p>
<p>Note: Hyperdata Linking is simply what an HTTP URI facilitates.</p> 
<p>Example problems solved by injecting Linked Data into the Web:</p>
<ol>
<li>Federated Identity by enabling Individuals to unambiguously Identify themselves (Profiles++) courtesy of existing <a href="http://dbpedia.org/resource/Internet" id="link-id13926e28">Internet</a> and Web protocols (e.g., <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id13646ec8">FOAF</a>+SSL&#39;s WebIDs which combine Personal Identity with X.509 certificates and HTTPs based client side certification)</li>
<li>Security and Privacy challenge alleviation by delivering a mechanism for policy based data access that feeds off federated individual identity and social network (graph) traversal</li>
<li>Spam Busting via the above.</li>
<li>
Increasing the Serendipitous Discovery Quotient (SDQ) of Web accessible resources by embedding Rich Metadata into (X)HTML Documents e.g., structured descriptions of your &quot;WishLists&quot; and &quot;OfferLists&quot; via a common set of terms offered by vocabularies such as <a href="http://www.heppnetz.de/projects/goodrelations/" id="link-id1199b4d0">GoodRelations</a> and <a href="http://dbpedia.org/resource/SIOC" id="link-id1334cfb0">SIOC</a> 
</li>
<li>Coherent integration of disparate data across the Web and/or within the Enterprise via &quot;Data Meshing&quot; rather than &quot;Data Mashing&quot;</li>
<li>Moving beyond imprecise statistically driven &quot;Keyword Search&quot; (e.g. Page Rank) to &quot;Precision Find&quot; driven by typed link based <a href="http://dbpedia.org/resource/Entity" id="link-id135f6fe8">Entity</a> Rank plus Entity Type and Entity Property filters.</li> 
</ol>
<h3>Conclusion</h3>
<p>If all of the above still falls into the technical mumbo-jumbo realm, then simply consider Linked Data as delivering Open Data Access in granular form to Web accessible data -- that goes beyond data containers (documents or files).</p> 
<p>The value proposition of Linked Data is inextricably linked to the value proposition of the <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id1356f5c0">World Wide Web</a>. This is true, because the Linked Data meme is ultimately about an enhancement of the current Web; achieved by reintroducing its architectural essence -- in new <a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id11300828">context</a> -- via a new level of link abstraction, courtesy of the Identity and Access duality of HTTP URIs.</p> 
<p>As a result of Linked Data, you can now have Links on the Web for a Person, Document, Music, Consumer Electronics, Products &amp; Services, Business Opening &amp; Closing Hours, Personal &quot;WishLists&quot; and &quot;OfferLists&quot;, an Idea, etc., in addition to links for Properties (Attributes &amp; Values) of the aforementioned. Ultimately, all of these links will be indexed in a myriad of ways providing the substrate for the next major period of Internet &amp; Web driven innovation, within our larger human-ingenuity driven innovation continuum.</p>
<h3>Related</h3>
<ul>
<li>
  <a href="http://www.ebusiness-unibw.org/wiki/GoodRelations#Recipes_and_Examples" id="link-id11386648">Recipes for Describing Your Business and its Offerings using the GoodRelations Vocabulary / Schema</a>
</li>
<li>
  <a href="http://slidesix.com/view/SolvingRealProblemsUsingLinkedData" id="link-id13658ee0">Solving Real Problems with RDF based Linked Data</a>
</li>
<li>
  <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&amp;q=linked%20data&amp;type=text&amp;output=html" id="link-id1175a650">Other Linked Data Posts from this Blog oriented Linked Data Space</a> (goes back a few years!)</li>
<li>Various practical <a href="http://delicious.com/kidehen/linked_data_demo" id="link-id13390cf8">Linked Data demo links from my Del.icio.us Bookmark oriented Data Space</a>
</li>
<li>
  <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-id132cda80">My personal WebID</a> which is a conduit to a Linked Data mesh covering a vast variety of things I&#39;ve opted to share with others via the Web (best viewed using a Linked Data aware User Agent like ODE).</li>
</ul>









]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-04-22#1542">
  <rss:title>Take N: Yet Another OpenLink Data Spaces Introduction</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-04-22T18:46:18Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Problem: Your Life, Profession, Web, and Internet do not need to become mutually exclusive due to &quot;information overload&quot;. Solution: A platform or service that delivers a point of online presence that embodies the fundamental separation of: Identity, Data Access, Data Representation, Data Presentation, by adhering to Web and Internet protocols. How: Typical post installation (Local or Cloud) task sequence: Identify myself (happens automatically by way of registration) If in an LDAP environment, import accounts or associate system with LDAP for account lookup and authentication Identify Online Accounts (by fleshing out profile) which also connects system to online accounts and their data Use Profile for granular description (Biography, Interests, WishList, OfferList, etc.) Optionally upstream or downstream data to and from my online accounts Create content Tagging Rules Create rules for associating Tags with formal URIs Create automatic Hyperlinking Rules for reuse when new content is created (e.g. Blog posts) Exploit Data Portability virtues of RSS, Atom, OPML, RDFa, RDF/XML, and other formats for imports and exports Automatically tag imported content Use function-specific helper application UIs for domain specific data generation e.g. AddressBook (optionally use vCard import), Calendar (optionally use iCalendar import), Email, File Storage (use WebDAV mount with copy and paste or HTTP GET), Feed Subscriptions (optionally import RSS/Atom/OPML feeds), Bookmarking (optionally import bookmark.html or XBEL) etc.. 
Optionally enable &quot;Conversation&quot; feature (today: Social Media feature) across the relevant application domains (manage conversations under covers using NNTP, the standard for this functionality realm) Generate HTTP based Entity IDs (URIs) for every piece of data in this burgeoning data space Use REST based APIs to perform CRUD tasks against my data (local and remote) (SPARQL, GData, Ubiquity Commands, Atom Publishing) Use OpenID, OAuth, FOAF+SSL, FOAF+SSL+OpenID for accessing data elsewhere Use OpenID, OAuth, FOAF+SSL, FOAF+SSL+OpenID for Controlling access to my data (Self Signed Certificate Generation, Browser Import of said Certificate &amp; associated Private Key, plus persistence of Certificate to FOAF based profile data space in &quot;one click&quot;) Have a simple UI for Entity-Attribute-Value or Subject-Predicate-Object arbitrary data annotations and creation since you can&#39;t pre model an &quot;Open World&quot; where the only constant is data flow Have my Personal URI (Web ID) as the single entry point for controlled access to my HTTP accessible data space I&#39;ve just outlined a snippet of the capabilities of the OpenLink Data Spaces platform. A platform built using OpenLink Virtuoso, architected to deliver: open, platform independent, multi-model, data access and data management across heterogeneous data sources. All you need to remember is your URI when seeking to interact with your data space. Related Get Yourself a URI (Web ID) in 5 Minutes or Less! Various posts over the years about Data Spaces Future of Desktop Post Simplify My Life Post by Bengee Nowack</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<h3>Problem:</h3>
<p>Your Life, Profession, <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a>, and <a href="http://dbpedia.org/resource/Internet" id="link-id0x1c6687f8">Internet</a> do not need to become mutually exclusive due to &quot;<a href="http://dbpedia.org/resource/Information" id="link-id0x1c6696e8">information</a> overload&quot;.</p>

<h3>Solution:</h3>
<p>
A platform or service that delivers a point of online presence that embodies the fundamental separation of: Identity, <a href="http://dbpedia.org/resource/Data">Data</a> Access, Data Representation, Data Presentation, by adhering to Web and Internet protocols.</p>

<h3>How:</h3>
<p>
Typical post installation (Local or Cloud) task sequence:</p>
<ol>
<li>
Identify myself (happens automatically by way of registration)</li>
<li>If in an LDAP environment, import accounts or associate system with LDAP for account lookup and authentication</li>
<li>
Identify Online Accounts (by fleshing out profile) which also connects system to online accounts and their data</li>
<li>Use Profile for granular description (Biography, Interests, WishList, OfferList, etc.)</li>
<li>Optionally upstream or downstream data to and from my online accounts</li>
<li>Create content Tagging Rules</li>
<li>Create rules for associating Tags with formal URIs</li>
<li>Create automatic Hyperlinking Rules for reuse when new content is created (e.g. <a href="http://dbpedia.org/resource/Blog" id="link-id11a7c660">Blog</a> posts)</li>
<li>Exploit Data Portability virtues of RSS, Atom, OPML, <a href="http://dbpedia.org/resource/RDFa" id="link-id13f54d50">RDFa</a>, RDF/XML, and other formats for imports and exports</li> 
<li>Automatically <a href="http://dbpedia.org/resource/Tag" id="link-id121ddff0">tag</a> imported content</li>
<li>Use function-specific helper application UIs for domain specific data generation e.g. AddressBook (optionally use vCard import), Calendar (optionally use iCalendar import), Email, File Storage (use WebDAV mount with copy and paste or HTTP GET), Feed Subscriptions (optionally import RSS/Atom/OPML feeds), Bookmarking (optionally import bookmark.html or XBEL) etc..</li>
<li>Optionally enable &quot;Conversation&quot; feature (today: Social Media feature) across the relevant application domains (manage conversations under covers using NNTP, the standard for this functionality realm)
</li>
<li>Generate HTTP based <a href="http://dbpedia.org/resource/Entity" id="link-id13d5d378">Entity</a> IDs (URIs) for every piece of data in this burgeoning <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id11a69670">data space</a>
</li>
<li>Use REST based APIs to perform CRUD tasks against my data (local and remote) (<a href="http://dbpedia.org/resource/SPARQL" id="link-id11a76e10">SPARQL</a>, GData, Ubiquity Commands, Atom Publishing)</li> 

<li>Use OpenID, OAuth, <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id11c9b3e0">FOAF</a>+SSL, FOAF+SSL+OpenID for accessing data elsewhere</li>
<li>Use OpenID, OAuth, FOAF+SSL, FOAF+SSL+OpenID for Controlling access to my data (Self Signed Certificate Generation, Browser Import of said Certificate &amp; associated Private Key, plus persistence of Certificate to FOAF based profile data space in &quot;one click&quot;)</li>
<li>Have a simple UI for <a href="http://dbpedia.org/resource/Entity-attribute-value_model" id="link-id14015bd0">Entity</a>-Attribute-Value or Subject-Predicate-Object arbitrary data annotations and creation since you can&#39;t pre model an &quot;<a href="http://dbpedia.org/resource/Open_world_assumption" id="link-id11cd8548">Open World</a>&quot; where the only constant is data flow</li>
<li>Have my Personal <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id142beee8">URI</a> (Web ID) as the single entry point for controlled access to my HTTP accessible data space</li>
</ol>
<p>
I&#39;ve just outlined a snippet of the capabilities of the <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id13d64740">OpenLink Data Spaces</a> platform. A platform built using OpenLink <a href="http://virtuoso.openlinksw.com" id="link-id13d74170">Virtuoso</a>, architected to deliver: open, platform independent, multi-model, data access and data management across heterogeneous data sources.
</p>
<p>
All you need to remember is your URI when seeking to interact with your data space.</p>

<h3>Related</h3>
<ol>
<li>
  <a href="http://virtuoso.openlinksw.com/dataspace/dav/wiki/Main/GetAPersonalURIIn5MinutesOrLess" id="link-id13c97948">Get Yourself a URI (Web ID) in 5 Minutes or Less!</a>
</li>
<li>
<a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&amp;q=%22data%20spaces%22&amp;type=text&amp;output=html" id="link-id1431e088">Various posts over the years about Data Spaces</a>
</li>
<li>
  <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/weblog/kidehen@openlinksw.com%27s%20BLOG%20%5B127%5D/1415" id="link-id11f837f0">Future of Desktop Post</a>
</li>
 <li>
  <a href="http://bnode.org/blog/2009/04/22/semantic-web-apps-to-simplify-my-life" id="link-id1393f8a8">Simplify My Life Post</a> by <a href="http://bnode.org/about" id="link-id11da0cc8">Bengee Nowack</a>
 </li>
</ol>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2009-04-01#1541">
  <rss:title>Web Scale and Fault Tolerance</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-04-01T15:18:06Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">One concern about Virtuoso Cluster is fault tolerance. This post talks about the basics of fault tolerance and what we can do with this, from improving resilience and optimizing performance to accommodating bulk loads without impacting interactive response. We will see that this is yet another step towards a 24/7 web-scale Linked Data Web. We will see how large scale, continuous operation, and redundancy are related. It has been said many times — when things are large enough, failures become frequent. In view of this, basic storage of partitions in multiple copies is built into the Virtuoso cluster from the start. Until now, this feature has not been tested or used very extensively, aside from the trivial case of keeping all schema information in synchronous replicas on all servers. Approaches to Fault Tolerance Fault tolerance has many aspects but it starts with keeping data in at least two copies. There are shared-disk cluster databases like Oracle RAC that do not depend on partitioning. With these, as long as the disk image is intact, servers can come and go. The fault tolerance of the disk in turn comes from mirroring done by the disk controller. Raids other than mirrored disk are not really good for databases because of write speed. With shared-nothing setups like Virtuoso, fault tolerance is based on multiple servers keeping the same logical data. The copies are synchronized transaction-by-transaction but are not bit-for-bit identical nor write-by-write synchronous as is the case with mirrored disks. There are asynchronous replication schemes generally based on log shipping, where the replica replays the transaction log of the master copy. The master copy gets the updates, the replica replays them. Both can take queries. These do not guarantee an entirely ACID fail-over but for many applications they come close enough. 
In a tightly coupled cluster, it is possible to do synchronous, transactional updates on multiple copies without great added cost. Sending the message to two places instead of one does not make much difference since it is the latency that counts. But once we go to wide area networks, this becomes as good as unworkable for any sort of update volume. Thus, wide area replication must in practice be asynchronous. This is a subject for another discussion. For now, the short answer is that wide area log shipping must be adapted to the application&#39;s requirements for synchronicity and consistency. Also, exactly what content is shipped and to where depends on the application. Some application-specific logic will likely be involved; more than this one cannot say without a specific context. Basics of Partition Fail-Over For now, we will be concerned with redundancy protecting against broken hardware, software slowdown, or crashes inside a single site. The basic idea is simple: Writes go to all copies; reads that must be repeatable or serializable (i.e., locking) go to the first copy; reads that refer to committed state without guarantee of repeatability can be balanced among all copies. When a copy goes offline, nobody needs to know, as long as there is at least one copy online for each partition. The exception in practice is when there are open cursors or such stateful things as aggregations pending on a copy that goes down. Then the query or transaction will abort and the application can retry. This looks like a deadlock to the application. Coming back online is more complicated. This requires establishing that the recovering copy is actually in sync. In practice this requires a short window during which no transactions have uncommitted updates. Sometimes, forcing this can require aborting some transactions, which again looks like a deadlock to the application. 
When an error is seen, such as a process no longer accepting connections and dropping existing cluster connections, we in practice go via two stages. First, the operations that directly depended on this process are aborted, as well as any computation being done on behalf of the disconnected server. At this stage, attempting to read data from the partition of the failed server will go to another copy but writes will still try to update all copies and will fail if the failed copy continues to be offline. After it is established that the failed copy will stay off for some time, writes may be re-enabled — but now having the failed copy rejoin the cluster will be more complicated, requiring an atomic window to ensure sync, as mentioned earlier. For the DBA, there can be intermittent software crashes where a failed server automatically restarts itself, and there can be prolonged failures where this does not happen. Both are alerts but the first kind can wait. Since a system must essentially run itself, it will wait for some time for the failed server to restart itself. During this window, all reads of the failed partition go to the spare copy and writes give an error. If the spare does not come back up in time, the system will automatically re-enable writes on the spare but now the failed server may no longer rejoin the cluster without a complex sync cycle. This all can happen in well under a minute, faster than a human operator can react. The diagnostics can be done later. If the situation was a hardware failure, recovery consists of taking a spare server and copying the database from the surviving online copy. This done, the spare server can come on line. Copying the database can be done while online and accepting updates but this may take some time, maybe an hour for every 200G of data copied over a network. In principle this could be automated by scripting, but we would normally expect a human DBA to be involved. 
As a general rule, reacting to the failure goes automatically without disruption of service but bringing the failed copy online will usually require some operator action. Levels of Tolerance and Performance The only way to make failures totally invisible is to have all in duplicate and provisioned so that the system never runs at more than half the total capacity. This is often not economical or necessary. This is why we can do better, using the spare capacity for more than standby. Imagine keeping a repository of linked data. Most of the content will come in through periodic bulk replacement of data sets. Some data will come in through pings from applications publishing FOAF and similar. Some data will come through on-demand RDFization of resources. The performance of such a repository essentially depends on having enough memory. Having this memory in duplicate is just added cost. What we can do instead is have all copies store the whole partition but when routing queries, apply range partitioning on top of the basic hash partitioning. If one partition stores IDs 64K - 128K, the next partition 128K - 192K, and so forth, and all partitions are stored in two full copies, we can route reads to the first 32K IDs to the first copy and reads to the second 32K IDs to the second copy. In this way, the copies will keep different working sets. The RAM is used to full advantage. Of course, if there is a failure, then the working set will degrade, but if this is not often and not for long, this can be quite tolerable. The alternate expense is buying twice as much RAM, likely meaning twice as many servers. This workload is memory intensive, thus servers should have the maximum memory they can have without going to parts that are so expensive one gets a new server for the price of doubling memory. Background Bulk Processing When loading data, the system is online in principle, but query response can be quite bad. 
A large RDF load will involve most memory and queries will miss the cache. The load will further keep most disks busy, so response is not good. This is the case as soon as a server&#39;s partition of the database is four times the size of RAM or greater. Whether the work is bulk-load or bulk-delete makes little difference. But if partitions are replicated, we can temporarily split the database so that the first copies serve queries and the second copies do the load. If the copies serving on line activities do some updates also, these updates will be committed on both copies. But the load will be committed on the second copy only. This is fully appropriate as long as the data are different. When the bulk load is done, the second copy of each partition will have the full up to date state, including changes that came in during the bulk load. The online activity can be now redirected to the second copies and the first copies can be overwritten in the background by the second copies, so as to again have all data in duplicate. Failures during such operations are not dangerous. If the copies doing the bulk load fail, the bulk load will have to be restarted. If the front end copies fail, the front end load goes to the copies doing the bulk load. Response times will be bad until the bulk load is stopped, but no data is lost. This technique applies to all data intensive background tasks â calculation of entity search ranks, data cleansing, consistency checking, and so on. If two copies are needed to keep up with the online load, then data can be kept just as well in three copies instead of two. This method applies to any data-warehouse-style workload which must coexist with online access and occasional low volume updating. Configurations of Redundancy Right now, we can declare that two or more server processes in a cluster form a group. All data managed by one member of the group is stored by all others. The members of the group are interchangeable. 
Thus, if there is four-servers-worth of data, then there will be a minimum of eight servers. Each of these servers will have one server process per core. The first hardware failure will not affect operations. For the second failure, there is a 1/7 chance that it stops the whole system, if it falls on the server whose pair is down. If groups consist of three servers, for a total of 12, the two first failures are guaranteed not to interrupt operations; for the third, there is a 1/10 chance that it will. We note that for big databases, as said before, the RAM cache capacity is the sum of all the servers&#39; RAM when in normal operation. There are other, more dynamic ways of splitting data among servers, so that partitions migrate between servers and spawn extra copies of themselves if not enough copies are online. The Google File System (GFS) does something of this sort at the file system level; Amazon&#39;s Dynamo does something similar at the database level. The analogies are not exact, though. If data is partitioned in this manner, for example into 1K slices, each in duplicate, with the rule that the two duplicates will not be on the same physical server, the first failure will not break operations but the second probably will. Without extra logic, there is a probability that the partitions formerly hosted by the failed server have their second copies randomly spread over the remaining servers. This scheme equalizes load better but is less resilient. Maintenance and Continuity Databases may benefit from defragmentation, rebalancing of indices, and so on. While these are possible online, by definition they affect the working set and make response times quite bad as soon as the database is significantly larger than RAM. With duplicate copies, the problem is largely solved. Also, software version changes need not involve downtime. Present Status The basics of replicated partitions are operational. 
The items to finalize are about system administration procedures and automatic synchronization of recovering copies. This must be automatic because if it is not, the operator will find a way to forget something or do some steps in the wrong order. This also requires a management view that shows what the different processes are doing and whether something is hung or failing repeatedly. All this is for the recovery part; taking failed partitions offline is easy.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>One concern about <a href="http://virtuoso.openlinksw.com" id="link-id0x719d2f8">Virtuoso</a> Cluster is fault tolerance. This post talks about the basics of fault tolerance and what we can do with this, from improving resilience and optimizing performance to accommodating bulk loads without impacting interactive response. We will see that this is yet another step towards a 24/7 web-scale <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0xa9a1d8d8">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0x25201030">Web</a>. We will see how large scale, continuous operation, and redundancy are related.</p>

<p>It has been said many times — when things are large enough, failures become frequent. In view of this, basic storage of partitions in multiple copies is built into the Virtuoso cluster from the start. Until now, this feature has not been tested or used very extensively, aside from the trivial case of keeping all schema <a href="http://dbpedia.org/resource/Information" id="link-id0x4548898">information</a> in synchronous replicas on all servers.</p>

<h2>Approaches to Fault Tolerance</h2>

<p>Fault tolerance has many aspects but it starts with keeping <a href="http://dbpedia.org/resource/Data" id="link-id0x18757400">data</a> in at least two copies. There are shared-disk cluster databases like <a href="http://dbpedia.org/resource/Oracle_Database" id="link-id0x711c900">Oracle</a> RAC that do not depend on partitioning. With these, as long as the disk image is intact, servers can come and go. The fault tolerance of the disk in turn comes from mirroring done by the disk controller. RAID levels other than mirroring are not really good for databases because of their write speed.</p>

<p>With shared-nothing setups like Virtuoso, fault tolerance is based on multiple servers keeping the same logical data. The copies are synchronized transaction-by-transaction but are not bit-for-bit identical nor write-by-write synchronous as is the case with mirrored disks.</p>

<p>There are asynchronous replication schemes generally based on log shipping, where the replica replays the transaction log of the master copy. The master copy gets the updates, the replica replays them. Both can take queries. These do not guarantee an entirely ACID fail-over but for many applications they come close enough.</p>

<p>In a tightly coupled cluster, it is possible to do synchronous, transactional updates on multiple copies without great added cost. Sending the message to two places instead of one does not make much difference since it is the latency that counts. But once we go to wide area networks, this becomes as good as unworkable for any sort of update volume. Thus, wide area replication must in practice be asynchronous.</p>

<p>This is a subject for another discussion. For now, the short answer is that wide area log shipping must be adapted to the application&#39;s requirements for synchronicity and consistency. Also, exactly what content is shipped and to where depends on the application. Some application-specific logic will likely be involved; more than this one cannot say without a specific context.</p>

<h2>Basics of Partition Fail-Over</h2>

<p>For now, we will be concerned with redundancy protecting against broken hardware, software slowdown, or crashes inside a single site.</p>

<p>The basic idea is simple: Writes go to all copies; reads that must be repeatable or serializable (i.e., locking) go to the first copy; reads that refer to committed state without guarantee of repeatability can be balanced among all copies. When a copy goes offline, nobody needs to know, as long as there is at least one copy online for each partition. The exception in practice is when there are open cursors or such stateful things as aggregations pending on a copy that goes down. Then the query or transaction will abort and the application can retry. This looks like a deadlock to the application.</p>

<p>Coming back online is more complicated. This requires establishing that the recovering copy is actually in sync. In practice this requires a short window during which no transactions have uncommitted updates. Sometimes, forcing this can require aborting some transactions, which again looks like a deadlock to the application.</p>

<p>When an error is seen, such as a process no longer accepting connections and dropping existing cluster connections, we in practice go via two stages. First, the operations that directly depended on this process are aborted, as well as any computation being done on behalf of the disconnected server. At this stage, attempting to read data from the partition of the failed server will go to another copy but writes will still try to update all copies and will fail if the failed copy continues to be offline. After it is established that the failed copy will stay off for some time, writes may be re-enabled — but now having the failed copy rejoin the cluster will be more complicated, requiring an atomic window to ensure sync, as mentioned earlier.</p>

<p>For the DBA, there can be intermittent software crashes where a failed server automatically restarts itself, and there can be prolonged failures where this does not happen. Both are alerts but the first kind can wait. Since a system must essentially run itself, it will wait for some time for the failed server to restart itself. During this window, all reads of the failed partition go to the spare copy and writes give an error. If the failed server does not come back up in time, the system will automatically re-enable writes on the spare but now the failed server may no longer rejoin the cluster without a complex sync cycle. All of this can happen in well under a minute, faster than a human operator can react. The diagnostics can be done later.</p>

<p>If the situation was a hardware failure, recovery consists of taking a spare server and copying the database from the surviving online copy. This done, the spare server can come online. Copying the database can be done while online and accepting updates but this may take some time, maybe an hour for every 200 GB of data copied over a network. In principle this could be automated by scripting, but we would normally expect a human DBA to be involved.</p>

<p>As a general rule, reacting to the failure goes automatically without disruption of service but bringing the failed copy online will usually require some operator action.</p>

<h2>Levels of Tolerance and Performance</h2>

<p>The only way to make failures totally invisible is to have everything in duplicate and provisioned so that the system never runs at more than half the total capacity. This is often neither economical nor necessary. We can do better by using the spare capacity for more than standby.</p>

<p>Imagine keeping a repository of linked data. Most of the content will come in through periodic bulk replacement of data sets. Some data will come in through pings from applications publishing FOAF and similar. Some data will come through on-demand RDFization of resources.</p>

<p>The performance of such a repository essentially depends on having enough memory. Having this memory in duplicate is just added cost. What we can do instead is have all copies store the whole partition but when routing queries, apply range partitioning on top of the basic hash partitioning. If one partition stores IDs 64K - 128K, the next partition 128K - 192K, and so forth, and all partitions are stored in two full copies, we can route reads of the first 32K IDs of each partition to the first copy and reads of the second 32K IDs to the second copy. In this way, the copies will keep different working sets. The RAM is used to full advantage.</p>

<p>Of course, if there is a failure, then the working set will degrade, but if this is not often and not for long, this can be quite tolerable. The alternate expense is buying twice as much RAM, likely meaning twice as many servers. This workload is memory intensive, thus servers should have the maximum memory they can have without going to parts that are so expensive one gets a new server for the price of doubling memory.</p>

<h2>Background Bulk Processing</h2>

<p>When loading data, the system is online in principle, but query response can be quite bad. A large <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x19fd9c18">RDF</a> load will involve most memory and queries will miss the cache. The load will further keep most disks busy, so response is not good. This is the case as soon as a server&#39;s partition of the database is four times the size of RAM or greater. Whether the work is bulk-load or bulk-delete makes little difference.</p>

<p>But if partitions are replicated, we can temporarily split the database so that the first copies serve queries and the second copies do the load. If the copies serving online activities do some updates also, these updates will be committed on both copies. But the load will be committed on the second copy only. This is fully appropriate as long as the data are different. When the bulk load is done, the second copy of each partition will have the full up-to-date state, including changes that came in during the bulk load. The online activity can now be redirected to the second copies and the first copies can be overwritten in the background by the second copies, so as to again have all data in duplicate.</p>

<p>Failures during such operations are not dangerous. If the copies doing the bulk load fail, the bulk load will have to be restarted. If the front end copies fail, the front end load goes to the copies doing the bulk load. Response times will be bad until the bulk load is stopped, but no data is lost.</p>

<p>This technique applies to all data-intensive background tasks — calculation of <a href="http://dbpedia.org/resource/Entity" id="link-id0x20b7a568">entity</a> search ranks, data cleansing, consistency checking, and so on. If two copies are needed to keep up with the online load, then data can be kept just as well in three copies instead of two. This method applies to any data-warehouse-style workload which must coexist with online access and occasional low-volume updating.</p>

<h2>Configurations of Redundancy</h2>

<p>Right now, we can declare that two or more server processes in a cluster form a group. All data managed by one member of the group is stored by all others. The members of the group are interchangeable. Thus, if there is four-servers-worth of data, then there will be a minimum of eight servers. Each of these servers will have one server process per core. The first hardware failure will not affect operations. For the second failure, there is a 1/7 chance that it stops the whole system, namely when it falls on the server whose pair is already down. If groups consist of three servers, for a total of 12, the first two failures are guaranteed not to interrupt operations; for the third, there is a 1/10 chance that it will.</p>

<p>We note that for big databases, as said before, the RAM cache capacity is the sum of all the servers&#39; RAM when in normal operation.</p>

<p>There are other, more dynamic ways of splitting data among servers, so that partitions migrate between servers and spawn extra copies of themselves if not enough copies are online. The Google File System (GFS) does something of this sort at the file system level; Amazon&#39;s Dynamo does something similar at the database level. The analogies are not exact, though.</p>

<p>If data is partitioned in this manner, for example into 1K slices, each in duplicate, with the rule that the two duplicates will not be on the same physical server, the first failure will not break operations but the second probably will. Without extra logic, the partitions formerly hosted by the failed server will probably have their second copies randomly spread over the remaining servers, so that a second failure on almost any server takes some partition offline. This scheme equalizes load better but is less resilient.</p>

<h2>Maintenance and Continuity</h2>

<p>Databases may benefit from defragmentation, rebalancing of indices, and so on. While these are possible online, by definition they affect the working set and make response times quite bad as soon as the database is significantly larger than RAM. With duplicate copies, the problem is largely solved. Also, software version changes need not involve downtime.</p>

<h2>Present Status</h2>

<p>The basics of replicated partitions are operational. The items to finalize are about system administration procedures and automatic synchronization of recovering copies. This must be automatic because if it is not, the operator will find a way to forget something or do some steps in the wrong order. This also requires a management view that shows what the different processes are doing and whether something is hung or failing repeatedly. All this is for the recovery part; taking failed partitions offline is easy.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2009-04-01#1540">
  <rss:title>Web Scale and Fault Tolerance</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-04-01T15:18:06Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">One concern about Virtuoso Cluster is fault tolerance. This post talks about the basics of fault tolerance and what we can do with this, from improving resilience and optimizing performance to accommodating bulk loads without impacting interactive response. We will see that this is yet another step towards a 24/7 web-scale Linked Data Web. We will see how large scale, continuous operation, and redundancy are related. It has been said many times â when things are large enough, failures become frequent. In view of this, basic storage of partitions in multiple copies is built into the Virtuoso cluster from the start. Until now, this feature has not been tested or used very extensively, aside from the trivial case of keeping all schema information in synchronous replicas on all servers. Approaches to Fault Tolerance Fault tolerance has many aspects but it starts with keeping data in at least two copies. There are shared-disk cluster databases like Oracle RAC that do not depend on partitioning. With these, as long as the disk image is intact, servers can come and go. The fault tolerance of the disk in turn comes from mirroring done by the disk controller. Raids other than mirrored disk are not really good for databases because of write speed. With shared-nothing setups like Virtuoso, fault tolerance is based on multiple servers keeping the same logical data. The copies are synchronized transaction-by-transaction but are not bit-for-bit identical nor write-by-write synchronous as is the case with mirrored disks. There are asynchronous replication schemes generally based on log shipping, where the replica replays the transaction log of the master copy. The master copy gets the updates, the replica replays them. Both can take queries. These do not guarantee an entirely ACID fail-over but for many applications they come close enough. 
In a tightly coupled cluster, it is possible to do synchronous, transactional updates on multiple copies without great added cost. Sending the message to two places instead of one does not make much difference since it is the latency that counts. But once we go to wide area networks, this becomes as good as unworkable for any sort of update volume. Thus, wide area replication must in practice be asynchronous. This is a subject for another discussion. For now, the short answer is that wide area log shipping must be adapted to the application&#39;s requirements for synchronicity and consistency. Also, exactly what content is shipped and to where depends on the application. Some application-specific logic will likely be involved; more than this one cannot say without a specific context. Basics of Partition Fail-Over For now, we will be concerned with redundancy protecting against broken hardware, software slowdown, or crashes inside a single site. The basic idea is simple: Writes go to all copies; reads that must be repeatable or serializable (i.e., locking) go to the first copy; reads that refer to committed state without guarantee of repeatability can be balanced among all copies. When a copy goes offline, nobody needs to know, as long as there is at least one copy online for each partition. The exception in practice is when there are open cursors or such stateful things as aggregations pending on a copy that goes down. Then the query or transaction will abort and the application can retry. This looks like a deadlock to the application. Coming back online is more complicated. This requires establishing that the recovering copy is actually in sync. In practice this requires a short window during which no transactions have uncommitted updates. Sometimes, forcing this can require aborting some transactions, which again looks like a deadlock to the application. 
When an error is seen, such as a process no longer accepting connections and dropping existing cluster connections, we in practice go via two stages. First, the operations that directly depended on this process are aborted, as well as any computation being done on behalf of the disconnected server. At this stage, attempting to read data from the partition of the failed server will go to another copy but writes will still try to update all copies and will fail if the failed copy continues to be offline. After it is established that the failed copy will stay off for some time, writes may be re-enabled â but now having the failed copy rejoin the cluster will be more complicated, requiring an atomic window to ensure sync, as mentioned earlier. For the DBA, there can be intermittent software crashes where a failed server automatically restarts itself, and there can be prolonged failures where this does not happen. Both are alerts but the first kind can wait. Since a system must essentially run itself, it will wait for some time for the failed server to restart itself. During this window, all reads of the failed partition go to the spare copy and writes give an error. If the spare does not come back up in time, the system will automatically re-enable writes on the spare but now the failed server may no longer rejoin the cluster without a complex sync cycle. This all can happen in well under a minute, faster than a human operator can react. The diagnostics can be done later. If the situation was a hardware failure, recovery consists of taking a spare server and copying the database from the surviving online copy. This done, the spare server can come on line. Copying the database can be done while online and accepting updates but this may take some time, maybe an hour for every 200G of data copied over a network. In principle this could be automated by scripting, but we would normally expect a human DBA to be involved. 
As a general rule, reacting to the failure goes automatically without disruption of service but bringing the failed copy online will usually require some operator action. Levels of Tolerance and Performance The only way to make failures totally invisible is to have all in duplicate and provisioned so that the system never runs at more than half the total capacity. This is often not economical or necessary. This is why we can do better, using the spare capacity for more than standby. Imagine keeping a repository of linked data. Most of the content will come in through periodic bulk replacement of data sets. Some data will come in through pings from applications publishing FOAF and similar. Some data will come through on-demand RDFization of resources. The performance of such a repository essentially depends on having enough memory. Having this memory in duplicate is just added cost. What we can do instead is have all copies store the whole partition but when routing queries, apply range partitioning on top of the basic hash partitioning. If one partition stores IDs 64K - 128K, the next partition 128K - 192K, and so forth, and all partitions are stored in two full copies, we can route reads to the first 32K IDs to the first copy and reads to the second 32K IDs to the second copy. In this way, the copies will keep different working sets. The RAM is used to full advantage. Of course, if there is a failure, then the working set will degrade, but if this is not often and not for long, this can be quite tolerable. The alternate expense is buying twice as much RAM, likely meaning twice as many servers. This workload is memory intensive, thus servers should have the maximum memory they can have without going to parts that are so expensive one gets a new server for the price of doubling memory. Background Bulk Processing When loading data, the system is online in principle, but query response can be quite bad. 
A large RDF load will involve most memory and queries will miss the cache. The load will further keep most disks busy, so response is not good. This is the case as soon as a server&#39;s partition of the database is four times the size of RAM or greater. Whether the work is bulk-load or bulk-delete makes little difference. But if partitions are replicated, we can temporarily split the database so that the first copies serve queries and the second copies do the load. If the copies serving on line activities do some updates also, these updates will be committed on both copies. But the load will be committed on the second copy only. This is fully appropriate as long as the data are different. When the bulk load is done, the second copy of each partition will have the full up to date state, including changes that came in during the bulk load. The online activity can be now redirected to the second copies and the first copies can be overwritten in the background by the second copies, so as to again have all data in duplicate. Failures during such operations are not dangerous. If the copies doing the bulk load fail, the bulk load will have to be restarted. If the front end copies fail, the front end load goes to the copies doing the bulk load. Response times will be bad until the bulk load is stopped, but no data is lost. This technique applies to all data intensive background tasks â calculation of entity search ranks, data cleansing, consistency checking, and so on. If two copies are needed to keep up with the online load, then data can be kept just as well in three copies instead of two. This method applies to any data-warehouse-style workload which must coexist with online access and occasional low volume updating. Configurations of Redundancy Right now, we can declare that two or more server processes in a cluster form a group. All data managed by one member of the group is stored by all others. The members of the group are interchangeable. 
Thus, if there is four-servers-worth of data, then there will be a minimum of eight servers. Each of these servers will have one server process per core. The first hardware failure will not affect operations. For the second failure, there is a 1/7 chance that it stops the whole system, if it falls on the server whose pair is down. If groups consist of three servers, for a total of 12, the two first failures are guaranteed not to interrupt operations; for the third, there is a 1/10 chance that it will. We note that for big databases, as said before, the RAM cache capacity is the sum of all the servers&#39; RAM when in normal operation. There are other, more dynamic ways of splitting data among servers, so that partitions migrate between servers and spawn extra copies of themselves if not enough copies are online. The Google File System (GFS) does something of this sort at the file system level; Amazon&#39;s Dynamo does something similar at the database level. The analogies are not exact, though. If data is partitioned in this manner, for example into 1K slices, each in duplicate, with the rule that the two duplicates will not be on the same physical server, the first failure will not break operations but the second probably will. Without extra logic, there is a probability that the partitions formerly hosted by the failed server have their second copies randomly spread over the remaining servers. This scheme equalizes load better but is less resilient. Maintenance and Continuity Databases may benefit from defragmentation, rebalancing of indices, and so on. While these are possible online, by definition they affect the working set and make response times quite bad as soon as the database is significantly larger than RAM. With duplicate copies, the problem is largely solved. Also, software version changes need not involve downtime. Present Status The basics of replicated partitions are operational. 
The items to finalize are about system administration procedures and automatic synchronization of recovering copies. This must be automatic because if it is not, the operator will find a way to forget something or do some steps in the wrong order. This also requires a management view that shows what the different processes are doing and whether something is hung or failing repeatedly. All this is for the recovery part; taking failed partitions offline is easy.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>One concern about <a href="http://virtuoso.openlinksw.com" id="link-id0x3b82c38">Virtuoso</a> Cluster is fault tolerance. This post talks about the basics of fault tolerance and what we can do with this, from improving resilience and optimizing performance to accommodating bulk loads without impacting interactive response. We will see that this is yet another step towards a 24/7 web-scale <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x22c42e10">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0x1e4f0b58">Web</a>. We will see how large scale, continuous operation, and redundancy are related.</p>

<p>It has been said many times — when things are large enough, failures become frequent. In view of this, basic storage of partitions in multiple copies is built into the Virtuoso cluster from the start. Until now, this feature has not been tested or used very extensively, aside from the trivial case of keeping all schema <a href="http://dbpedia.org/resource/Information" id="link-id0x224401c0">information</a> in synchronous replicas on all servers.</p>

<h2>Approaches to Fault Tolerance</h2>

<p>Fault tolerance has many aspects but it starts with keeping <a href="http://dbpedia.org/resource/Data" id="link-id0x230b7500">data</a> in at least two copies. There are shared-disk cluster databases like <a href="http://dbpedia.org/resource/Oracle_Database" id="link-id0xa9a1d8d8">Oracle</a> RAC that do not depend on partitioning. With these, as long as the disk image is intact, servers can come and go. The fault tolerance of the disk in turn comes from mirroring done by the disk controller. RAID levels other than mirroring are not really good for databases because of their write speed.</p>

<p>With shared-nothing setups like Virtuoso, fault tolerance is based on multiple servers keeping the same logical data. The copies are synchronized transaction-by-transaction but are not bit-for-bit identical nor write-by-write synchronous as is the case with mirrored disks.</p>

<p>There are asynchronous replication schemes generally based on log shipping, where the replica replays the transaction log of the master copy. The master copy gets the updates, the replica replays them. Both can take queries. These do not guarantee an entirely ACID fail-over but for many applications they come close enough.</p>

<p>In a tightly coupled cluster, it is possible to do synchronous, transactional updates on multiple copies without great added cost. Sending the message to two places instead of one does not make much difference since it is the latency that counts. But once we go to wide area networks, this becomes as good as unworkable for any sort of update volume. Thus, wide area replication must in practice be asynchronous.</p>

<p>This is a subject for another discussion. For now, the short answer is that wide area log shipping must be adapted to the application&#39;s requirements for synchronicity and consistency. Also, exactly what content is shipped and to where depends on the application. Some application-specific logic will likely be involved; more than this one cannot say without a specific context.</p>

<h2>Basics of Partition Fail-Over</h2>

<p>For now, we will be concerned with redundancy protecting against broken hardware, software slowdown, or crashes inside a single site.</p>

<p>The basic idea is simple: Writes go to all copies; reads that must be repeatable or serializable (i.e., locking) go to the first copy; reads that refer to committed state without guarantee of repeatability can be balanced among all copies. When a copy goes offline, nobody needs to know, as long as there is at least one copy online for each partition. The exception in practice is when there are open cursors or such stateful things as aggregations pending on a copy that goes down. Then the query or transaction will abort and the application can retry. This looks like a deadlock to the application.</p>

<p>Coming back online is more complicated. This requires establishing that the recovering copy is actually in sync. In practice this requires a short window during which no transactions have uncommitted updates. Sometimes, forcing this can require aborting some transactions, which again looks like a deadlock to the application.</p>

<p>When an error is seen, such as a process no longer accepting connections and dropping existing cluster connections, we in practice go through two stages. First, the operations that directly depended on this process are aborted, as well as any computation being done on behalf of the disconnected server. At this stage, attempting to read data from the partition of the failed server will go to another copy but writes will still try to update all copies and will fail if the failed copy continues to be offline. After it is established that the failed copy will stay off for some time, writes may be re-enabled &#8212; but now having the failed copy rejoin the cluster will be more complicated, requiring an atomic window to ensure sync, as mentioned earlier.</p>

<p>For the DBA, there can be intermittent software crashes where a failed server automatically restarts itself, and there can be prolonged failures where this does not happen. Both are alerts but the first kind can wait. Since a system must essentially run itself, it will wait for some time for the failed server to restart itself. During this window, all reads of the failed partition go to the spare copy and writes give an error. If the failed server does not come back up in time, the system will automatically re-enable writes on the spare but now the failed server may no longer rejoin the cluster without a complex sync cycle. This all can happen in well under a minute, faster than a human operator can react. The diagnostics can be done later.</p>

<p>If the situation was a hardware failure, recovery consists of taking a spare server and copying the database from the surviving online copy. This done, the spare server can come on line. Copying the database can be done while online and accepting updates but this may take some time, maybe an hour for every 200G of data copied over a network. In principle this could be automated by scripting, but we would normally expect a human DBA to be involved.</p>

<p>As a general rule, reacting to the failure goes automatically without disruption of service but bringing the failed copy online will usually require some operator action.</p>

<h2>Levels of Tolerance and Performance</h2>

<p>The only way to make failures totally invisible is to have all in duplicate and provisioned so that the system never runs at more than half the total capacity. This is often not economical or necessary. This is why we can do better, using the spare capacity for more than standby.</p>

<p>Imagine keeping a repository of linked data. Most of the content will come in through periodic bulk replacement of data sets. Some data will come in through pings from applications publishing FOAF and similar. Some data will come through on-demand RDFization of resources.</p>

<p>The performance of such a repository essentially depends on having enough memory. Having this memory in duplicate is just added cost. What we can do instead is have all copies store the whole partition but when routing queries, apply range partitioning on top of the basic hash partitioning. If one partition stores IDs 64K - 128K, the next partition 128K - 192K, and so forth, and all partitions are stored in two full copies, we can route reads to the first 32K IDs to the first copy and reads to the second 32K IDs to the second copy. In this way, the copies will keep different working sets. The RAM is used to full advantage.</p>

<p>Of course, if there is a failure, then the working set will degrade, but if this is not often and not for long, this can be quite tolerable. The alternate expense is buying twice as much RAM, likely meaning twice as many servers. This workload is memory intensive, thus servers should have the maximum memory they can have without going to parts that are so expensive one gets a new server for the price of doubling memory.</p>

<h2>Background Bulk Processing</h2>

<p>When loading data, the system is online in principle, but query response can be quite bad. A large <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x3c0cfb8">RDF</a> load will involve most memory and queries will miss the cache. The load will further keep most disks busy, so response is not good. This is the case as soon as a server&#39;s partition of the database is four times the size of RAM or greater. Whether the work is bulk-load or bulk-delete makes little difference.</p>

<p>But if partitions are replicated, we can temporarily split the database so that the first copies serve queries and the second copies do the load. If the copies serving on line activities do some updates also, these updates will be committed on both copies. But the load will be committed on the second copy only. This is fully appropriate as long as the data are different. When the bulk load is done, the second copy of each partition will have the full up to date state, including changes that came in during the bulk load. The online activity can be now redirected to the second copies and the first copies can be overwritten in the background by the second copies, so as to again have all data in duplicate.</p>

<p>Failures during such operations are not dangerous. If the copies doing the bulk load fail, the bulk load will have to be restarted. If the front end copies fail, the front end load goes to the copies doing the bulk load. Response times will be bad until the bulk load is stopped, but no data is lost.</p>

<p>This technique applies to all data intensive background tasks &#8212; calculation of <a href="http://dbpedia.org/resource/Entity" id="link-id0x3b38ac0">entity</a> search ranks, data cleansing, consistency checking, and so on. If two copies are needed to keep up with the online load, then data can be kept just as well in three copies instead of two. This method applies to any data-warehouse-style workload which must coexist with online access and occasional low volume updating.</p>

<h2>Configurations of Redundancy</h2>

<p>Right now, we can declare that two or more server processes in a cluster form a group. All data managed by one member of the group is stored by all others. The members of the group are interchangeable. Thus, if there is four-servers-worth of data, then there will be a minimum of eight servers. Each of these servers will have one server process per core. The first hardware failure will not affect operations. For the second failure, there is a 1/7 chance that it stops the whole system, if it falls on the server whose pair is down. If groups consist of three servers, for a total of 12, the two first failures are guaranteed not to interrupt operations; for the third, there is a 1/10 chance that it will.</p>

<p>We note that for big databases, as said before, the RAM cache capacity is the sum of all the servers&#39; RAM when in normal operation.</p>

<p>There are other, more dynamic ways of splitting data among servers, so that partitions migrate between servers and spawn extra copies of themselves if not enough copies are online. The Google File System (GFS) does something of this sort at the file system level; Amazon&#39;s Dynamo does something similar at the database level. The analogies are not exact, though.</p>

<p>If data is partitioned in this manner, for example into 1K slices, each in duplicate, with the rule that the two duplicates will not be on the same physical server, the first failure will not break operations but the second probably will. Without extra logic, it is likely that the partitions formerly hosted by the failed server have their second copies randomly spread over the remaining servers. This scheme equalizes load better but is less resilient.</p>

<h2>Maintenance and Continuity</h2>

<p>Databases may benefit from defragmentation, rebalancing of indices, and so on. While these are possible online, by definition they affect the working set and make response times quite bad as soon as the database is significantly larger than RAM. With duplicate copies, the problem is largely solved. Also, software version changes need not involve downtime.</p>

<h2>Present Status</h2>

<p>The basics of replicated partitions are operational. The items to finalize are about system administration procedures and automatic synchronization of recovering copies. This must be automatic because if it is not, the operator will find a way to forget something or do some steps in the wrong order. This also requires a management view that shows what the different processes are doing and whether something is hung or failing repeatedly. All this is for the recovery part; taking failed partitions offline is easy.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2009-03-25#1538">
  <rss:title>Beyond Applications - Introducing the Planetary Datasphere (Part 2)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-03-25T15:50:56Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">We have looked at the general implications of the DataSphere, a universal, ubiquitous database infrastructure, on end-user experience and application development and content. Now we will look at what this means at the back end, from hosting to security to server software and hardware. Application Hosting For the infrastructure provider, hosting the DataSphere is no different from hosting large Web 2.0 sites. This may be paid for by users, as in the cloud computing model where users rent capacity for their own purposes, or by advertisers, as in most of Web 2.0. Clouds play a role in this as places with high local connectivity. The DataSphere is the atmosphere; the Cloud is an atmospheric phenomenon. What of Proprietary Data and its Security? Having proprietary data does not imply using a proprietary language. I would say that for any domain of discourse, no matter how private or specialized, at least some structural concepts can be borrowed from public, more generic sources. This lowers training thresholds and facilitates integration. Being able to integrate does not imply opening one&#39;s own data. To take an analogy, if you have a bunker with closed circuit air recycling, you still breathe air, even if that air is cut off from the atmosphere at large. For places with complex existing RDBMS security, the best is to map the RDBMS to RDF on the fly, always running all requests through the RDBMS. This implicitly preserves any policy or label based security schemes. What of Individual Privacy on the Open Web? The more complex situations will be found in environments with mixed security needs, as in social networking with partly-open and partly-closed profiles. The FOAF+SSL solution with https:// URIs is one approach. For query processing, we have a question of enforcing instance-level policies. In the DataSphere, granting privileges on tables and views no longer makes sense. 
In SQL, a policy means that behind the scenes the DBMS will add extra criteria to queries and updates depending on who is issuing them. The query processor adds conditions like getting the user&#39;s department ID and comparing it to the department ID on the payroll record. Labeled security is a scheme where data rows themselves contain security tags and the DBMS enforces these, row by row. I would say that these techniques are suited for highly-structured situations where the roles, compartments, and needs are clear, and where the organization has the database know-how to write, test, and deploy such rules by the table, row, and column. This does not sit well with schema-last. I would not bet much on an average developer&#39;s capacity for making airtight policies on RDF data where not even 100% schema-adherence is guaranteed. Doing security at the RDF graph level seems more appropriate. In many use cases, the graph is analogous to a photo album or a file system directory. A Data Space can be divided into graphs to provide more granularity for expressing topic, provenance, or security. If policy conditions apply mostly to the graph, then things are not as likely to slip by, for example, policy rules missing some infrequent misuse of the schema. In these cases, the burden on the query processor is also not excessive: Just as with documents, the container (table, graph) is the object of access grants, not the individual sentences (DBMS records, RDF triples) in the document. It is left to the application to present a choice of graph level policies to the user. Exactly what these will be depends on the domain of discourse. A policy might restrict access to a meeting in a calendar to people whose OpenIDs figure in the attendee list, or limit access to a photo album to people mentioned in the owner&#39;s social network. Defining such policies is typically a task for the application developer. 
The difference between the Document Web and the Linked Data Web is that while the Document Web enforces security when a thing is returned to the user, Linked Data Web enforcement must occur whenever a query references something, even if this is an intermediate result not directly shown to the user. The DataSphere will offer a generic policy scheme, filtering what graphs are accessed in a given query situation. Other applications may then verify the safety of one&#39;s disclosed information using the same DataSphere infrastructure. Of course, the user must rely on the infrastructure provider to correctly enforce these rules. Then again, some users will operate and audit their own infrastructure anyway. Federation vs. Centralization On the open web, there is the question of federation vs. centralization. If an application is seen to be an interface to a vocabulary, it becomes more agnostic with respect to this. In practice, if we are talking about hosted services, what is hosted together joins much faster. Data Spaces with lots of interlinking, such as closely connected social networks, will tend to cluster together on the same cloud to facilitate joint operation. Data is ubiquitous and not location-conscious, but what one can efficiently do with it depends on location. Joint access patterns favor joint location. Due to technicalities of the matter, single database clusters will run complex queries within the cluster 100 to 1000 times faster than between clusters. The size of such data clouds may be in the hundreds-of-billions of triples. It seems to make sense to have data belonging to same-type or jointly-used applications close together. In practice, there will arise partitioning by type of usage, user profile, etc., but this is no longer airtight and applications more-or-less float on top of all of this. A search engine can host a copy of the Document Web and allow text lookups on it. 
But a text lookup is a single well-defined query that happens to parallelize and partition very well. A search engine can also have all the structured public data copied, but the problem there is that queries are a lot less predictable and may take orders of magnitude more resources than a single text lookup. As a partial answer, even now, we can set up a database so that the first million single-row joins cost the user nothing, but doing more requires a special subscription. The cost for hosting a trillion triples will vary radically in function of what throughput is promised. This may result in pricing per service level, a bit like ISP pricing varies in function of promised connectivity. Queries can be run for free if no throughput guarantee applies, and might cost more if the host promises at least five-million joins-per-second including infrequently-accessed data. Performance and cost dynamics will probably lead to the emergence of domain-specific clusters of colocated Data Spaces. The landscape will be hybrid, where usage drives data colocation. A single Google is not a practical solution to the world&#39;s spectrum of query needs. What is the Cost of Schema-Last? The DataSphere proposition is predicated on a worldwide database fabric that can store anything, just like a network can transport anything. It cannot enforce a fixed schema, just like TCP/IP cannot say that it will transport only email. This is continuous schema evolution. Well, TCP/IP can transport anything but it does transport a lot of HTML and email. Similarly, the DataSphere can optimize for some common vocabularies. We have seen that an application-specific relational schema is often 10 times more efficient than an equivalent completely generic RDF representation of the same thing. The gap may narrow, but task specific representations will keep an edge. We ought to know, as we do both. While anything can be represented, the masses are not that creative. 
For any data-hosting provider, making a specialized representation for the top 100 entities may cut data size in half or better. This is a behind-the-scenes optimization that will in time be a matter of course. Historically, our industry has been driven by two phenomena: New PCs every 2 years. To make this necessary, Windows has been getting bigger and bigger, and not upgrading is not an option if one must exchange documents with new data formats and keep up with security. Agility, or ad hoc over planned. The reason the RDBMS won over CODASYL network databases was that one did not have to define what queries could be made when creating the database. With the Linked Data Web, we have one more step in this direction when we say that one does not have to decide what can be represented when creating the database. To summarize, there is some cost to schema-last, but then our industry needs more complexity to keep justifying constant investment. The cost is in this sense not all bad. Building the DataSphere may be the next great driver of server demand. As a case in point, Cisco, whose fortune was made when the network became ubiquitous, just entered the server game. It&#39;s in the air. DataSphere Precursors Right now, we have the Linked Open Data movement with lots of new data being added. We have the drive for data- and reputation-portability. We have Freebase as a demonstrator of end-users actually producing structured data. We have convergence of terminology around DBpedia, FOAF, SIOC, and more. We have demonstrators of useful data integration on the RDF stack in diverse fields, especially life sciences. We have a totally ubiquitous network for the distribution of this, plus database technology to make this work. We have a practical need for semantics, as search is getting saturated, email is getting killed by spam, and information overload is a constant. Social networks can be leveraged for solving a lot of this, if they can only be opened. 
Of course, there is a call for transparency in society at large. Well, the battle of transparency vs. spin is a permanent feature of human existence but even there, we cannot ignore the possibilities of open data. Databases and Servers Technically, what does this take? Mostly, this takes a lot of memory. The software is there and we are productizing it as we speak. As with other data intensive things, the key is scalable querying over clusters of commodity servers. Nothing we have not heard before. Of course, the DBMS must know about RDF specifics to get the right query plans and so on but this we have explained elsewhere. This all comes down to the cost of memory. No amount of CPU or network speed will make any difference if data is not in memory. Right now, a board with 8G and a dual core AMD X86-64 and 4 disks may cost about $700. 2 x 4 core Xeon and 16G and 8 disks may be $4000, counting just the components. In our experience, about 32G per billion triples is a minimum. This must be backed by a few independent disks so as to fill the cache in parallel. A cluster with 1 TB of RAM would be under $100K if built from low end boards. The workload is all about large joins across partitions. The queries parallelize well, thus using the largest and most expensive machines for building blocks is not cost efficient. Having absolutely everything in RAM is also not cost efficient, but it is necessary to have many disks to absorb the random access load. Disk access is predominantly random, unlike some analytics workloads that can read serially. If SSD&#39;s get a bit cheaper, one could have SSD for the database and disk for backup. With large data centers, redundancy becomes an issue. The most cost effective redundancy is simply storing partitions in duplicate or triplicate on different commodity servers. The DBMS software should handle the replication and fail-over. For operating such systems, scaling-on-demand is necessary. 
Data must move between servers, and adding or replacing servers should be an on-the-fly operation. Also, since access is essentially never uniform, the most commonly accessed partitions may benefit from being kept in more copies than less frequently accessed ones. The DBMS must be essentially self administrating since these things are quite complex and easily intractable if one does not have in depth understanding of this rather complex field. The best price point for hardware varies with time. Right now, the optimum is to have many basic motherboards with maximum memory in a rack unit, then another unit with local disks for each motherboard. Much cheaper than SAN&#39;s and Infiniband fabrics. Conclusions and Next Steps The ingredients and use cases are there. If server clusters with 1TB RAM begin under $100K, the cost of deployment is small compared to personnel costs. Bootstrapping the DataSphere from current Linked Open Data, such as DBpedia, OpenCYC, Freebase, and every sort of social network, is feasible. Aside from private data integration and analytics efforts and E-science, the use cases are liberating social networks and C2C and some aspects of search from silos, overcoming spam, and mass use of semantics extracted from text. Emergent effects will then carry the ball to places we have not yet been. The Linked Data Web has its origins in Semantic Web research, and many of the present participants come from these circles. Things may have been slowed down by a disconnect, only too typical of human activity, between Semantic Web research on one hand and database engineering on the other. Right now, the challenge is one of engineering. As documented on this blog, we have worked quite a bit on cluster databases, mostly but not exclusively with RDF use cases. The actual challenges of this are however not at all what is discussed in Semantic Web conferences. 
These have to do with complexities of parallelism, timing, message bottlenecks, transactions, and the like, i.e., hardcore engineering. These are difficult beyond what the casual onlooker might guess but not impossible. The details that remain to be worked out are nothing semantic, they are hardcore database, concerning automatic provisioning and such matters. It is as if the Semantic Web people look with envy at the Web 2.0 side where there are big deployments in production, yet they do not seem quite ready to take the step themselves. Well, I will write some other time about research and engineering. For now, the message is &#8212; go for it. Stay tuned for more announcements, as we near production with our next generation of software. Related Beyond Applications - Introducing the Planetary Datasphere (Part 1) Serendipitous Discovery Quotient (SDQ) How Linked Data will change Advertising The Time for RDBMS Primacy Downgrade is Nigh! Data Spaces</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://www.openlinksw.com/weblog/oerling/?id=1535" id="link-id155e3bd0">We have looked at the general implications of the DataSphere</a>, a universal, ubiquitous database infrastructure, on end-user experience and application development and content. Now we will look at what this means at the back end, from hosting to security to server software and hardware.</p>

<h2>Application Hosting</h2>

<p>For the infrastructure provider, hosting the DataSphere is no different from hosting large Web 2.0 sites. This may be paid for by users, as in the cloud computing model where users rent capacity for their own purposes, or by advertisers, as in most of Web 2.0.</p>

<p>Clouds play a role in this as places with high local connectivity. The DataSphere is the atmosphere; the Cloud is an atmospheric phenomenon.</p>

<h2>What of Proprietary <a href="http://dbpedia.org/resource/Data" id="link-id0x10fd3e18">Data</a> and its Security?</h2>

<p>Having proprietary data does not imply using a proprietary language. I would say that for any domain of discourse, no matter how private or specialized, at least some structural concepts can be borrowed from public, more generic sources. This lowers training thresholds and facilitates integration. Being able to integrate does not imply opening one&#39;s own data. To take an analogy, if you have a bunker with closed circuit air recycling, you still breathe air, even if that air is cut off from the atmosphere at large. For places with complex existing <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id0x13cae0b0">RDBMS</a> security, the best is to map the RDBMS to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x13deb7d8">RDF</a> on the fly, always running all requests through the RDBMS. This implicitly preserves any policy or label based security schemes.</p>

<h2>What of Individual Privacy on the Open Web?</h2>

<p>The more complex situations will be found in environments with mixed security needs, as in social networking with partly-open and partly-closed profiles. The FOAF+SSL solution with <code>https://</code> URIs is one approach. For query processing, we have a question of enforcing instance-level policies. In the DataSphere, granting privileges on tables and views no longer makes sense. In <a href="http://dbpedia.org/resource/SQL" id="link-id0x1211a490">SQL</a>, a policy means that behind the scenes the DBMS will add extra criteria to queries and updates depending on who is issuing them. The query processor adds conditions like getting the user&#39;s department ID and comparing it to the department ID on the payroll record. Labeled security is a scheme where data rows themselves contain security tags and the DBMS enforces these, row by row.</p>

<p>I would say that these techniques are suited for highly-structured situations where the roles, compartments, and needs are clear, and where the organization has the database know-how to write, test, and deploy such rules by the table, row, and column. This does not sit well with schema-last. I would not bet much on an average developer&#39;s capacity for making airtight policies on RDF data where not even 100% schema-adherence is guaranteed.</p>

<p>Doing security at the RDF graph level seems more appropriate. In many use cases, the graph is analogous to a photo album or a file system directory. A Data <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id0x13beff18">Space</a> can be divided into graphs to provide more granularity for expressing topic, provenance, or security. If policy conditions apply mostly to the graph, then things are not as likely to slip by, for example, policy rules missing some infrequent misuse of the schema. In these cases, the burden on the query processor is also not excessive: Just as with documents, the container (table, graph) is the object of access grants, not the individual sentences (DBMS records, RDF triples) in the document.</p>

<p>It is left to the application to present a choice of graph level policies to the user. Exactly what these will be depends on the domain of discourse. A policy might restrict access to a meeting in a calendar to people whose OpenIDs figure in the attendee list, or limit access to a photo album to people mentioned in the owner&#39;s social network. Defining such policies is typically a task for the application developer.</p>

<p>The difference between the Document Web and the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x13106cd0">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0x13ca1050">Web</a> is that while the Document Web enforces security when a thing is returned to the user, Linked Data Web enforcement must occur whenever a query references something, even if this is an intermediate result not directly shown to the user.</p>

<p>The DataSphere will offer a generic policy scheme, filtering what graphs are accessed in a given query situation. Other applications may then verify the safety of one&#39;s disclosed <a href="http://dbpedia.org/resource/Information" id="link-id0x13a02d60">information</a> using the same DataSphere infrastructure. Of course, the user must rely on the infrastructure provider to correctly enforce these rules. Then again, some users will operate and audit their own infrastructure anyway.</p>

<h2>Federation vs. Centralization</h2>

<p>On the open web, there is the question of federation vs. centralization. If an application is seen to be an interface to a vocabulary, it becomes more agnostic with respect to this. In practice, if we are talking about hosted services, what is hosted together joins much faster. Data Spaces with lots of interlinking, such as closely connected social networks, will tend to cluster together on the same cloud to facilitate joint operation. Data is ubiquitous and not location-conscious, but what one can efficiently do with it depends on location. Joint access patterns favor joint location. Due to technicalities of the matter, single database clusters will run complex queries within the cluster 100 to 1000 times faster than between clusters. The size of such data clouds may be in the hundreds-of-billions of triples. It seems to make sense to have data belonging to same-type or jointly-used applications close together. In practice, there will arise partitioning by type of usage, user profile, etc., but this is no longer airtight and applications more-or-less float on top of all of this.</p>

<p>A search engine can host a copy of the Document Web and allow text lookups on it. But a text lookup is a single well-defined query that happens to parallelize and partition very well. A search engine can also have all the structured public data copied, but the problem there is that queries are a lot less predictable and may take orders of magnitude more resources than a single text lookup. As a partial answer, even now, we can set up a database so that the first million single-row joins cost the user nothing, but doing more requires a special subscription.</p>

<p>The cost of hosting a trillion triples will vary radically as a function of the throughput promised. This may result in pricing per service level, much as ISP pricing varies as a function of promised connectivity. Queries can be run for free if no throughput guarantee applies, and might cost more if the host promises at least five-million joins-per-second including infrequently-accessed data.</p>

<p>Performance and cost dynamics will probably lead to the emergence of domain-specific clusters of colocated Data Spaces. The landscape will be hybrid, where usage drives data colocation. A single Google is not a practical solution to the world&#39;s spectrum of query needs.</p>

<h2>What is the Cost of Schema-Last?</h2>

<p>The DataSphere proposition is predicated on a worldwide database fabric that can store anything, just like a network can transport anything. It cannot enforce a fixed schema, just like TCP/IP cannot say that it will transport only email. This is continuous schema evolution. Well, TCP/IP can transport anything but it does transport a lot of HTML and email. Similarly, the DataSphere can optimize for some common vocabularies.</p>

<p>We have seen that an application-specific relational schema is often 10 times more efficient than an equivalent completely generic RDF representation of the same thing. The gap may narrow, but task specific representations will keep an edge. We ought to know, as we do both.</p>

<p>While anything can be represented, the masses are not that creative. For any data-hosting provider, making a specialized representation for the top 100 entities may cut data size in half or better. This is a behind-the-scenes optimization that will in time be a matter of course.</p>

<p>Historically, our industry has been driven by two phenomena:</p>

<ol>
<li>
  <b>New PCs every 2 years.</b> To make this necessary, Windows has been getting bigger and bigger, and not upgrading is not an option if one must exchange documents with new data formats and keep up with security.</li>

<li>
  <b>Agility, or <i>ad hoc</i> over planned.</b> The reason the RDBMS won over <a href="http://dbpedia.org/resource/CODASYL" id="link-id0x24ee5098">CODASYL</a> network databases was that one did not have to define what queries could be made when creating the database. With the Linked Data Web, we have one more step in this direction when we say that one does not have to decide what can be represented when creating the database.</li>
</ol>

<p>To summarize, there is some cost to schema-last, but then our industry needs more complexity to keep justifying constant investment. The cost is in this sense not all bad.</p>

<p>Building the DataSphere may be the next great driver of server demand. As a case in point, Cisco, whose fortune was made when the network became ubiquitous, just entered the server game. It&#39;s in the air.</p>

<h2>DataSphere Precursors</h2>

<p>Right now, we have the <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id0x13ea7938">Linked Open Data</a> movement with lots of new data being added. We have the drive for data- and reputation-portability. We have Freebase as a demonstrator of end-users actually producing structured data. We have convergence of terminology around <a href="http://dbpedia.org/resource/DBpedia" id="link-id0x13ae45e8">DBpedia</a>, FOAF, SIOC, and more. We have demonstrators of useful data integration on the RDF stack in diverse fields, especially life sciences.</p>

<p>We have a totally ubiquitous network for the distribution of this, plus database technology to make this work.</p>

<p>We have a practical need for semantics, as search is getting saturated, email is getting killed by spam, and information overload is a constant. Social networks can be leveraged for solving a lot of this, if they can only be opened.</p>

<p>Of course, there is a call for transparency in society at large. Well, the battle of transparency vs. spin is a permanent feature of human existence but even there, we cannot ignore the possibilities of open data.</p>

<h2>Databases and Servers</h2>

<p>Technically, what does this take? Mostly, this takes a lot of memory. The software is there and we are productizing it as we speak. As with other data intensive things, the key is scalable querying over clusters of commodity servers. Nothing we have not heard before. Of course, the DBMS must know about RDF specifics to get the right query plans and so on but this we have explained elsewhere.</p>

<p>This all comes down to the cost of memory. No amount of CPU or network speed will make any difference if data is not in memory. Right now, a board with 8G and a dual core AMD X86-64 and 4 disks may cost about $700. 2 x 4 core Xeon and 16G and 8 disks may be $4000, counting just the components. In our experience, about 32G per billion triples is a minimum. This must be backed by a few independent disks so as to fill the cache in parallel. A cluster with 1 TB of RAM would be under $100K if built from low end boards.</p>

<p>The workload is all about large joins across partitions. The queries parallelize well, thus using the largest and most expensive machines for building blocks is not cost efficient. Having absolutely everything in RAM is also not cost efficient, but it is necessary to have many disks to absorb the random access load. Disk access is predominantly random, unlike some analytics workloads that can read serially. If SSDs get a bit cheaper, one could have SSD for the database and disk for backup.</p>

<p>With large data centers, redundancy becomes an issue. The most cost effective redundancy is simply storing partitions in duplicate or triplicate on different commodity servers. The DBMS software should handle the replication and fail-over.</p>

<p>For operating such systems, scaling-on-demand is necessary. Data must move between servers, and adding or replacing servers should be an on-the-fly operation. Also, since access is essentially never uniform, the most commonly accessed partitions may benefit from being kept in more copies than less frequently accessed ones. The DBMS must be essentially self-administering, since these things are quite complex and easily become intractable if one does not have an in-depth understanding of this rather complex field.</p>

<p>The best price point for hardware varies with time. Right now, the optimum is to have many basic motherboards with maximum memory in a rack unit, then another unit with local disks for each motherboard. This is much cheaper than SANs and InfiniBand fabrics.</p>

<h2>Conclusions and Next Steps</h2>

<p>The ingredients and use cases are there. If server clusters with 1TB RAM begin under $100K, the cost of deployment is small compared to personnel costs.</p>

<p>Bootstrapping the DataSphere from current Linked Open Data, such as DBpedia, <a href="http://dbpedia.org/resource/Cyc" id="link-id0x13c36da8">OpenCYC</a>, Freebase, and every sort of social network, is feasible. Aside from private data integration and analytics efforts and E-science, the use cases are liberating social networks and C2C and some aspects of search from silos, overcoming spam, and mass use of semantics extracted from text. Emergent effects will then carry the ball to places we have not yet been.</p>

<p>The Linked Data Web has its origins in <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0x1405f0a8">Semantic Web</a> research, and many of the present participants come from these circles. Things may have been slowed down by a disconnect, only too typical of human activity, between Semantic Web research on one hand and database engineering on the other. Right now, the challenge is one of engineering. As documented on this <a href="http://dbpedia.org/resource/Blog" id="link-id0x2329f1a8">blog</a>, we have worked quite a bit on cluster databases, mostly but not exclusively with RDF use cases. The actual challenges of this are however not at all what is discussed in Semantic Web conferences. These have to do with complexities of parallelism, timing, message bottlenecks, transactions, and the like, i.e., hardcore engineering. These are difficult beyond what the casual onlooker might guess but not impossible. The details that remain to be worked out are not semantic; they are hardcore database engineering, concerning automatic provisioning and such matters.</p>

<p>It is as if the Semantic Web people look with envy at the Web 2.0 side where there are big deployments in production, yet they do not seem quite ready to take the step themselves. Well, I will write some other time about research and engineering. For now, the message is &#8212; <i><b>go for it</b></i>. Stay tuned for more announcements, as we near production with our next generation of software.</p>


<h2>Related</h2>
<ul>
<li>
  <a href="http://www.openlinksw.com/weblog/oerling/?id=1535" id="link-id14e02bb0">Beyond Applications - Introducing the Planetary Datasphere (Part 1)</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?id=1442" id="link-id117dc518">Serendipitous Discovery Quotient (SDQ)</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?id=1534" id="link-id15c52410">How Linked Data will change Advertising</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?id=1519" id="link-id11e93658">The Time for RDBMS Primacy Downgrade is Nigh!</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?tag=DataSpace" id="link-id1491a588">Data Spaces</a>
</li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2009-03-25#1537">
  <rss:title>Beyond Applications - Introducing the Planetary Datasphere (Part 2)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-03-25T15:50:56Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">We have looked at the general implications of the DataSphere, a universal, ubiquitous database infrastructure, on end-user experience and application development and content. Now we will look at what this means at the back end, from hosting to security to server software and hardware. Application Hosting For the infrastructure provider, hosting the DataSphere is no different from hosting large Web 2.0 sites. This may be paid for by users, as in the cloud computing model where users rent capacity for their own purposes, or by advertisers, as in most of Web 2.0. Clouds play a role in this as places with high local connectivity. The DataSphere is the atmosphere; the Cloud is an atmospheric phenomenon. What of Proprietary Data and its Security? Having proprietary data does not imply using a proprietary language. I would say that for any domain of discourse, no matter how private or specialized, at least some structural concepts can be borrowed from public, more generic sources. This lowers training thresholds and facilitates integration. Being able to integrate does not imply opening one&#39;s own data. To take an analogy, if you have a bunker with closed circuit air recycling, you still breathe air, even if that air is cut off from the atmosphere at large. For places with complex existing RDBMS security, the best is to map the RDBMS to RDF on the fly, always running all requests through the RDBMS. This implicitly preserves any policy or label based security schemes. What of Individual Privacy on the Open Web? The more complex situations will be found in environments with mixed security needs, as in social networking with partly-open and partly-closed profiles. The FOAF+SSL solution with https:// URIs is one approach. For query processing, we have a question of enforcing instance-level policies. In the DataSphere, granting privileges on tables and views no longer makes sense. 
In SQL, a policy means that behind the scenes the DBMS will add extra criteria to queries and updates depending on who is issuing them. The query processor adds conditions like getting the user&#39;s department ID and comparing it to the department ID on the payroll record. Labeled security is a scheme where data rows themselves contain security tags and the DBMS enforces these, row by row. I would say that these techniques are suited for highly-structured situations where the roles, compartments, and needs are clear, and where the organization has the database know-how to write, test, and deploy such rules by the table, row, and column. This does not sit well with schema-last. I would not bet much on an average developer&#39;s capacity for making airtight policies on RDF data where not even 100% schema-adherence is guaranteed. Doing security at the RDF graph level seems more appropriate. In many use cases, the graph is analogous to a photo album or a file system directory. A Data Space can be divided into graphs to provide more granularity for expressing topic, provenance, or security. If policy conditions apply mostly to the graph, then things are not as likely to slip by, for example, policy rules missing some infrequent misuse of the schema. In these cases, the burden on the query processor is also not excessive: Just as with documents, the container (table, graph) is the object of access grants, not the individual sentences (DBMS records, RDF triples) in the document. It is left to the application to present a choice of graph level policies to the user. Exactly what these will be depends on the domain of discourse. A policy might restrict access to a meeting in a calendar to people whose OpenIDs figure in the attendee list, or limit access to a photo album to people mentioned in the owner&#39;s social network. Defining such policies is typically a task for the application developer. 
The difference between the Document Web and the Linked Data Web is that while the Document Web enforces security when a thing is returned to the user, Linked Data Web enforcement must occur whenever a query references something, even if this is an intermediate result not directly shown to the user. The DataSphere will offer a generic policy scheme, filtering what graphs are accessed in a given query situation. Other applications may then verify the safety of one&#39;s disclosed information using the same DataSphere infrastructure. Of course, the user must rely on the infrastructure provider to correctly enforce these rules. Then again, some users will operate and audit their own infrastructure anyway. Federation vs. Centralization On the open web, there is the question of federation vs. centralization. If an application is seen to be an interface to a vocabulary, it becomes more agnostic with respect to this. In practice, if we are talking about hosted services, what is hosted together joins much faster. Data Spaces with lots of interlinking, such as closely connected social networks, will tend to cluster together on the same cloud to facilitate joint operation. Data is ubiquitous and not location-conscious, but what one can efficiently do with it depends on location. Joint access patterns favor joint location. Due to technicalities of the matter, single database clusters will run complex queries within the cluster 100 to 1000 times faster than between clusters. The size of such data clouds may be in the hundreds-of-billions of triples. It seems to make sense to have data belonging to same-type or jointly-used applications close together. In practice, there will arise partitioning by type of usage, user profile, etc., but this is no longer airtight and applications more-or-less float on top of all of this. A search engine can host a copy of the Document Web and allow text lookups on it. 
But a text lookup is a single well-defined query that happens to parallelize and partition very well. A search engine can also have all the structured public data copied, but the problem there is that queries are a lot less predictable and may take orders of magnitude more resources than a single text lookup. As a partial answer, even now, we can set up a database so that the first million single-row joins cost the user nothing, but doing more requires a special subscription. The cost for hosting a trillion triples will vary radically in function of what throughput is promised. This may result in pricing per service level, a bit like ISP pricing varies in function of promised connectivity. Queries can be run for free if no throughput guarantee applies, and might cost more if the host promises at least five-million joins-per-second including infrequently-accessed data. Performance and cost dynamics will probably lead to the emergence of domain-specific clusters of colocated Data Spaces. The landscape will be hybrid, where usage drives data colocation. A single Google is not a practical solution to the world&#39;s spectrum of query needs. What is the Cost of Schema-Last? The DataSphere proposition is predicated on a worldwide database fabric that can store anything, just like a network can transport anything. It cannot enforce a fixed schema, just like TCP/IP cannot say that it will transport only email. This is continuous schema evolution. Well, TCP/IP can transport anything but it does transport a lot of HTML and email. Similarly, the DataSphere can optimize for some common vocabularies. We have seen that an application-specific relational schema is often 10 times more efficient than an equivalent completely generic RDF representation of the same thing. The gap may narrow, but task specific representations will keep an edge. We ought to know, as we do both. While anything can be represented, the masses are not that creative. 
For any data-hosting provider, making a specialized representation for the top 100 entities may cut data size in half or better. This is a behind-the-scenes optimization that will in time be a matter of course. Historically, our industry has been driven by two phenomena: New PCs every 2 years. To make this necessary, Windows has been getting bigger and bigger, and not upgrading is not an option if one must exchange documents with new data formats and keep up with security. Agility, or ad hoc over planned. The reason the RDBMS won over CODASYL network databases was that one did not have to define what queries could be made when creating the database. With the Linked Data Web, we have one more step in this direction when we say that one does not have to decide what can be represented when creating the database. To summarize, there is some cost to schema-last, but then our industry needs more complexity to keep justifying constant investment. The cost is in this sense not all bad. Building the DataSphere may be the next great driver of server demand. As a case in point, Cisco, whose fortune was made when the network became ubiquitous, just entered the server game. It&#39;s in the air. DataSphere Precursors Right now, we have the Linked Open Data movement with lots of new data being added. We have the drive for data- and reputation-portability. We have Freebase as a demonstrator of end-users actually producing structured data. We have convergence of terminology around DBpedia, FOAF, SIOC, and more. We have demonstrators of useful data integration on the RDF stack in diverse fields, especially life sciences. We have a totally ubiquitous network for the distribution of this, plus database technology to make this work. We have a practical need for semantics, as search is getting saturated, email is getting killed by spam, and information overload is a constant. Social networks can be leveraged for solving a lot of this, if they can only be opened. 
Of course, there is a call for transparency in society at large. Well, the battle of transparency vs. spin is a permanent feature of human existence but even there, we cannot ignore the possibilities of open data. Databases and Servers Technically, what does this take? Mostly, this takes a lot of memory. The software is there and we are productizing it as we speak. As with other data intensive things, the key is scalable querying over clusters of commodity servers. Nothing we have not heard before. Of course, the DBMS must know about RDF specifics to get the right query plans and so on but this we have explained elsewhere. This all comes down to the cost of memory. No amount of CPU or network speed will make any difference if data is not in memory. Right now, a board with 8G and a dual core AMD X86-64 and 4 disks may cost about $700. 2 x 4 core Xeon and 16G and 8 disks may be $4000, counting just the components. In our experience, about 32G per billion triples is a minimum. This must be backed by a few independent disks so as to fill the cache in parallel. A cluster with 1 TB of RAM would be under $100K if built from low end boards. The workload is all about large joins across partitions. The queries parallelize well, thus using the largest and most expensive machines for building blocks is not cost efficient. Having absolutely everything in RAM is also not cost efficient, but it is necessary to have many disks to absorb the random access load. Disk access is predominantly random, unlike some analytics workloads that can read serially. If SSD&#39;s get a bit cheaper, one could have SSD for the database and disk for backup. With large data centers, redundancy becomes an issue. The most cost effective redundancy is simply storing partitions in duplicate or triplicate on different commodity servers. The DBMS software should handle the replication and fail-over. For operating such systems, scaling-on-demand is necessary. 
Data must move between servers, and adding or replacing servers should be an on-the-fly operation. Also, since access is essentially never uniform, the most commonly accessed partitions may benefit from being kept in more copies than less frequently accessed ones. The DBMS must be essentially self administrating since these things are quite complex and easily intractable if one does not have in depth understanding of this rather complex field. The best price point for hardware varies with time. Right now, the optimum is to have many basic motherboards with maximum memory in a rack unit, then another unit with local disks for each motherboard. Much cheaper than SAN&#39;s and Infiniband fabrics. Conclusions and Next Steps The ingredients and use cases are there. If server clusters with 1TB RAM begin under $100K, the cost of deployment is small compared to personnel costs. Bootstrapping the DataSphere from current Linked Open Data, such as DBpedia, OpenCYC, Freebase, and every sort of social network, is feasible. Aside from private data integration and analytics efforts and E-science, the use cases are liberating social networks and C2C and some aspects of search from silos, overcoming spam, and mass use of semantics extracted from text. Emergent effects will then carry the ball to places we have not yet been. The Linked Data Web has its origins in Semantic Web research, and many of the present participants come from these circles. Things may have been slowed down by a disconnect, only too typical of human activity, between Semantic Web research on one hand and database engineering on the other. Right now, the challenge is one of engineering. As documented on this blog, we have worked quite a bit on cluster databases, mostly but not exclusively with RDF use cases. The actual challenges of this are however not at all what is discussed in Semantic Web conferences. 
These have to do with complexities of parallelism, timing, message bottlenecks, transactions, and the like, i.e., hardcore engineering. These are difficult beyond what the casual onlooker might guess but not impossible. The details that remain to be worked out are nothing semantic, they are hardcore database, concerning automatic provisioning and such matters. It is as if the Semantic Web people look with envy at the Web 2.0 side where there are big deployments in production, yet they do not seem quite ready to take the step themselves. Well, I will write some other time about research and engineering. For now, the message is &#8212; go for it. Stay tuned for more announcements, as we near production with our next generation of software. Related Beyond Applications - Introducing the Planetary Datasphere (Part 1) Serendipitous Discovery Quotient (SDQ) How Linked Data will change Advertising The Time for RDBMS Primacy Downgrade is Nigh! Data Spaces</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://www.openlinksw.com/weblog/oerling/?id=1535" id="link-id155e3bd0">We have looked at the general implications of the DataSphere</a>, a universal, ubiquitous database infrastructure, on end-user experience and application development and content. Now we will look at what this means at the back end, from hosting to security to server software and hardware.</p>

<h2>Application Hosting</h2>

<p>For the infrastructure provider, hosting the DataSphere is no different from hosting large Web 2.0 sites. This may be paid for by users, as in the cloud computing model where users rent capacity for their own purposes, or by advertisers, as in most of Web 2.0.</p>

<p>Clouds play a role in this as places with high local connectivity. The DataSphere is the atmosphere; the Cloud is an atmospheric phenomenon.</p>

<h2>What of Proprietary <a href="http://dbpedia.org/resource/Data" id="link-id0x13b5b4a0">Data</a> and its Security?</h2>

<p>Having proprietary data does not imply using a proprietary language. I would say that for any domain of discourse, no matter how private or specialized, at least some structural concepts can be borrowed from public, more generic sources. This lowers training thresholds and facilitates integration. Being able to integrate does not imply opening one&#39;s own data. To take an analogy, if you have a bunker with closed circuit air recycling, you still breathe air, even if that air is cut off from the atmosphere at large. For places with complex existing <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id0x24db80e0">RDBMS</a> security, the best is to map the RDBMS to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x24ea7c40">RDF</a> on the fly, always running all requests through the RDBMS. This implicitly preserves any policy or label based security schemes.</p>

<h2>What of Individual Privacy on the Open Web?</h2>

<p>The more complex situations will be found in environments with mixed security needs, as in social networking with partly-open and partly-closed profiles. The FOAF+SSL solution with <code>https://</code> URIs is one approach. For query processing, we have a question of enforcing instance-level policies. In the DataSphere, granting privileges on tables and views no longer makes sense. In <a href="http://dbpedia.org/resource/SQL" id="link-id0x24aaccc0">SQL</a>, a policy means that behind the scenes the DBMS will add extra criteria to queries and updates depending on who is issuing them. The query processor adds conditions like getting the user&#39;s department ID and comparing it to the department ID on the payroll record. Labeled security is a scheme where data rows themselves contain security tags and the DBMS enforces these, row by row.</p>

<p>I would say that these techniques are suited for highly-structured situations where the roles, compartments, and needs are clear, and where the organization has the database know-how to write, test, and deploy such rules by the table, row, and column. This does not sit well with schema-last. I would not bet much on an average developer&#39;s capacity for making airtight policies on RDF data where not even 100% schema-adherence is guaranteed.</p>

<p>Doing security at the RDF graph level seems more appropriate. In many use cases, the graph is analogous to a photo album or a file system directory. A Data <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id0x2396c058">Space</a> can be divided into graphs to provide more granularity for expressing topic, provenance, or security. If policy conditions apply mostly to the graph, then things are not as likely to slip by, for example, policy rules missing some infrequent misuse of the schema. In these cases, the burden on the query processor is also not excessive: Just as with documents, the container (table, graph) is the object of access grants, not the individual sentences (DBMS records, RDF triples) in the document.</p>

<p>It is left to the application to present a choice of graph level policies to the user. Exactly what these will be depends on the domain of discourse. A policy might restrict access to a meeting in a calendar to people whose OpenIDs figure in the attendee list, or limit access to a photo album to people mentioned in the owner&#39;s social network. Defining such policies is typically a task for the application developer.</p>

<p>The difference between the Document Web and the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x238a0098">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0x23882280">Web</a> is that while the Document Web enforces security when a thing is returned to the user, Linked Data Web enforcement must occur whenever a query references something, even if this is an intermediate result not directly shown to the user.</p>

<p>The DataSphere will offer a generic policy scheme, filtering what graphs are accessed in a given query situation. Other applications may then verify the safety of one&#39;s disclosed <a href="http://dbpedia.org/resource/Information" id="link-id0x2388e458">information</a> using the same DataSphere infrastructure. Of course, the user must rely on the infrastructure provider to correctly enforce these rules. Then again, some users will operate and audit their own infrastructure anyway.</p>

<h2>Federation vs. Centralization</h2>

<p>On the open web, there is the question of federation vs. centralization. If an application is seen to be an interface to a vocabulary, it becomes more agnostic with respect to this. In practice, if we are talking about hosted services, what is hosted together joins much faster. Data Spaces with lots of interlinking, such as closely connected social networks, will tend to cluster together on the same cloud to facilitate joint operation. Data is ubiquitous and not location-conscious, but what one can efficiently do with it depends on location. Joint access patterns favor joint location. Due to technicalities of the matter, single database clusters will run complex queries within the cluster 100 to 1000 times faster than between clusters. The size of such data clouds may be in the hundreds-of-billions of triples. It seems to make sense to have data belonging to same-type or jointly-used applications close together. In practice, there will arise partitioning by type of usage, user profile, etc., but this is no longer airtight and applications more-or-less float on top of all of this.</p>

<p>A search engine can host a copy of the Document Web and allow text lookups on it. But a text lookup is a single well-defined query that happens to parallelize and partition very well. A search engine can also have all the structured public data copied, but the problem there is that queries are a lot less predictable and may take orders of magnitude more resources than a single text lookup. As a partial answer, even now, we can set up a database so that the first million single-row joins cost the user nothing, but doing more requires a special subscription.</p>

<p>The cost for hosting a trillion triples will vary radically as a function of what throughput is promised. This may result in pricing per service level, a bit like ISP pricing varies as a function of promised connectivity. Queries can be run for free if no throughput guarantee applies, and might cost more if the host promises at least five-million joins-per-second including infrequently-accessed data.</p>

<p>Performance and cost dynamics will probably lead to the emergence of domain-specific clusters of colocated Data Spaces. The landscape will be hybrid, where usage drives data colocation. A single Google is not a practical solution to the world&#39;s spectrum of query needs.</p>

<h2>What is the Cost of Schema-Last?</h2>

<p>The DataSphere proposition is predicated on a worldwide database fabric that can store anything, just like a network can transport anything. It cannot enforce a fixed schema, just like TCP/IP cannot say that it will transport only email. This is continuous schema evolution. Well, TCP/IP can transport anything but it does transport a lot of HTML and email. Similarly, the DataSphere can optimize for some common vocabularies.</p>

<p>We have seen that an application-specific relational schema is often 10 times more efficient than an equivalent completely generic RDF representation of the same thing. The gap may narrow, but task specific representations will keep an edge. We ought to know, as we do both.</p>

<p>While anything can be represented, the masses are not that creative. For any data-hosting provider, making a specialized representation for the top 100 entities may cut data size in half or better. This is a behind-the-scenes optimization that will in time be a matter of course.</p>

<p>Historically, our industry has been driven by two phenomena:</p>

<ol>
<li>
  <b>New PCs every 2 years.</b> To make this necessary, Windows has been getting bigger and bigger, and not upgrading is not an option if one must exchange documents with new data formats and keep up with security.</li>

<li>
  <b>Agility, or <i>ad hoc</i> over planned.</b> The reason the RDBMS won over <a href="http://dbpedia.org/resource/CODASYL" id="link-id0x13b23460">CODASYL</a> network databases was that one did not have to define what queries could be made when creating the database. With the Linked Data Web, we have one more step in this direction when we say that one does not have to decide what can be represented when creating the database.</li>
</ol>

<p>To summarize, there is some cost to schema-last, but then our industry needs more complexity to keep justifying constant investment. The cost is in this sense not all bad.</p>

<p>Building the DataSphere may be the next great driver of server demand. As a case in point, Cisco, whose fortune was made when the network became ubiquitous, just entered the server game. It&#39;s in the air.</p>

<h2>DataSphere Precursors</h2>

<p>Right now, we have the <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id0x236a9be8">Linked Open Data</a> movement with lots of new data being added. We have the drive for data- and reputation-portability. We have Freebase as a demonstrator of end-users actually producing structured data. We have convergence of terminology around <a href="http://dbpedia.org/resource/DBpedia" id="link-id0x24db8350">DBpedia</a>, FOAF, SIOC, and more. We have demonstrators of useful data integration on the RDF stack in diverse fields, especially life sciences.</p>

<p>We have a totally ubiquitous network for the distribution of this, plus database technology to make this work.</p>

<p>We have a practical need for semantics, as search is getting saturated, email is getting killed by spam, and information overload is a constant. Social networks can be leveraged for solving a lot of this, if they can only be opened.</p>

<p>Of course, there is a call for transparency in society at large. Well, the battle of transparency vs. spin is a permanent feature of human existence but even there, we cannot ignore the possibilities of open data.</p>

<h2>Databases and Servers</h2>

<p>Technically, what does this take? Mostly, this takes a lot of memory. The software is there and we are productizing it as we speak. As with other data intensive things, the key is scalable querying over clusters of commodity servers. Nothing we have not heard before. Of course, the DBMS must know about RDF specifics to get the right query plans and so on but this we have explained elsewhere.</p>

<p>This all comes down to the cost of memory. No amount of CPU or network speed will make any difference if data is not in memory. Right now, a board with 8G and a dual core AMD X86-64 and 4 disks may cost about $700. 2 x 4 core Xeon and 16G and 8 disks may be $4000, counting just the components. In our experience, about 32G per billion triples is a minimum. This must be backed by a few independent disks so as to fill the cache in parallel. A cluster with 1 TB of RAM would be under $100K if built from low end boards.</p>

<p>The workload is all about large joins across partitions. The queries parallelize well, thus using the largest and most expensive machines for building blocks is not cost efficient. Having absolutely everything in RAM is also not cost efficient, but it is necessary to have many disks to absorb the random access load. Disk access is predominantly random, unlike some analytics workloads that can read serially. If SSD&#39;s get a bit cheaper, one could have SSD for the database and disk for backup.</p>

<p>With large data centers, redundancy becomes an issue. The most cost effective redundancy is simply storing partitions in duplicate or triplicate on different commodity servers. The DBMS software should handle the replication and fail-over.</p>

<p>For operating such systems, scaling-on-demand is necessary. Data must move between servers, and adding or replacing servers should be an on-the-fly operation. Also, since access is essentially never uniform, the most commonly accessed partitions may benefit from being kept in more copies than less frequently accessed ones. The DBMS must be essentially self administrating since these things are quite complex and easily intractable if one does not have in depth understanding of this rather complex field.</p>

<p>The best price point for hardware varies with time. Right now, the optimum is to have many basic motherboards with maximum memory in a rack unit, then another unit with local disks for each motherboard. Much cheaper than SAN&#39;s and Infiniband fabrics.</p>

<h2>Conclusions and Next Steps</h2>

<p>The ingredients and use cases are there. If server clusters with 1TB RAM begin under $100K, the cost of deployment is small compared to personnel costs.</p>

<p>Bootstrapping the DataSphere from current Linked Open Data, such as DBpedia, <a href="http://dbpedia.org/resource/Cyc" id="link-id0x2396a038">OpenCYC</a>, Freebase, and every sort of social network, is feasible. Aside from private data integration and analytics efforts and E-science, the use cases are liberating social networks and C2C and some aspects of search from silos, overcoming spam, and mass use of semantics extracted from text. Emergent effects will then carry the ball to places we have not yet been.</p>

<p>The Linked Data Web has its origins in <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0x13ea7110">Semantic Web</a> research, and many of the present participants come from these circles. Things may have been slowed down by a disconnect, only too typical of human activity, between Semantic Web research on one hand and database engineering on the other. Right now, the challenge is one of engineering. As documented on this <a href="http://dbpedia.org/resource/Blog" id="link-id0x2388e368">blog</a>, we have worked quite a bit on cluster databases, mostly but not exclusively with RDF use cases. The actual challenges of this are however not at all what is discussed in Semantic Web conferences. These have to do with complexities of parallelism, timing, message bottlenecks, transactions, and the like, i.e., hardcore engineering. These are difficult beyond what the casual onlooker might guess but not impossible. The details that remain to be worked out are nothing semantic, they are hardcore database, concerning automatic provisioning and such matters.</p>

<p>It is as if the Semantic Web people look with envy at the Web 2.0 side where there are big deployments in production, yet they do not seem quite ready to take the step themselves. Well, I will write some other time about research and engineering. For now, the message is &mdash; <i><b>go for it</b></i>. Stay tuned for more announcements, as we near production with our next generation of software.</p>


<h2>Related</h2>
<ul>
<li>
  <a href="http://www.openlinksw.com/weblog/oerling/?id=1535" id="link-id14e02bb0">Beyond Applications - Introducing the Planetary Datasphere (Part 1)</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?id=1442" id="link-id117dc518">Serendipitous Discovery Quotient (SDQ)</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?id=1534" id="link-id15c52410">How Linked Data will change Advertising</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?id=1519" id="link-id11e93658">The Time for RDBMS Primacy Downgrade is Nigh!</a>
</li>
<li>
  <a href="http://www.openlinksw.com/blog/~kidehen/?tag=DataSpace" id="link-id1491a588">Data Spaces</a>
</li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-03-14#1531">
  <rss:title>Simple Compare &amp; Contrast of Web 1.0, 2.0, and 3.0 (Update 1)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-03-14T18:20:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Here is a tabulated &quot;compare and contrast&quot; of Web usage patterns 1.0, 2.0, and 3.0. Web 1.0 Web 2.0 Web 3.0 Simple Definition Interactive / Visual Web Programmable Web Linked Data Web Unit of Presence Web Page Web Service Endpoint Data Space (named structured data enclave) Unit of Value Exchange Page URL Endpoint URL for API Resource / Entity / Object URI Data Granularity Low (HTML) Medium (XML) High (RDF) Defining Services Search Community (Blogs to Social Networks) Find Participation Quotient Low Medium High Serendipitous Discovery Quotient Low Medium High Data Referencability Quotient Low (Documents) Medium (Documents) High (Documents and their constituent Data) Subjectivity Quotient High Medium (from A-list bloggers to select source and partner lists) Low (everything is discovered via URIs) Transclusence Low Medium (Code driven Mashups) High (Data driven Meshups) What You See Is What You Prefer (WYSIWYP) Low Medium High (negotiated representation of resource descriptions) Open Data Access (Data Accessibility) Low Medium (Silos) High (no Silos) Identity Issues Handling Low Medium (OpenID) High (FOAF+SSL) Solution Deployment Model Centralized Centralized with sprinklings of Federation Federated with function specific Centralization (e.g. 
Lookup hubs like LOD Cloud or DBpedia) Data Model Orientation Logical (Tree based DOM) Logical (Tree based XML) Conceptual (Graph based RDF) User Interface Issues Dynamically generated static interfaces Dynamically generated interfaces with semi-dynamic interfaces (courtesy of XSLT or XQuery/XPath) Dynamic Interfaces (pre- and post-generation) courtesy of self-describing nature of RDF Data Querying Full Text Search Full Text Search Full Text Search + Structured Graph Pattern Query Language (SPARQL) What Each Delivers Democratized Publishing Democratized Journalism &amp; Commentary (Citizen Journalists &amp; Commentators) Democratized Analysis (Citizen Data Analysts) Star Wars Edition Analogy Star Wars (original fight for decentralization via rebellion) Empire Strikes Back (centralization and data silos make comeback) Return of the JEDI (FORCE emerges and facilitates decentralization from &quot;Identity&quot; all the way to &quot;Open Data Access&quot; and &quot;Negotiable Descriptive Data Representation&quot;) Naturally, I am not expecting everyone to agree with me. I am simply making my contribution to what will remain fascinating discourse for a long time to come :-) Related Web 3.0 The Best Official Definition Imaginable -- Nova Spivack&#39;s</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<p>Here is a tabulated &quot;compare and contrast&quot; of <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> usage patterns 1.0, 2.0, and 3.0.</p>  <table border="1" width="715" height="286">    <tbody>
  <tr>      <td>&#160;</td>      <td><strong>Web 1.0</strong></td>      <td><strong>Web 2.0</strong></td>      <td><strong>Web 3.0</strong></td>   </tr>    <tr>      <td><strong>Simple Definition</strong></td>      <td>Interactive / Visual Web</td>      <td>Programmable Web</td>      <td><a href="http://dbpedia.org/resource/Linked_Data" id="link-id117a9a98">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id146bcdb0">Web</a></td>   </tr>    <tr>      <td><strong>Unit of Presence</strong></td>      <td>Web Page</td>      <td>Web Service Endpoint</td>      <td><a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id11a66c60">Data Space</a> (named structured data enclave)</td>   </tr>    <tr>      <td><strong>Unit of Value Exchange</strong></td>      <td>Page <a href="http://dbpedia.org/resource/Uniform_Resource_Locator" id="link-id146083f8">URL</a></td>      <td>Endpoint URL for API</td>      <td>Resource / <a href="http://dbpedia.org/resource/Entity" id="link-id121b2148">Entity</a> / Object <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id1467ed00">URI</a></td>   </tr>    <tr>      <td><strong>Data Granularity</strong></td>      <td>Low (HTML)</td>      <td>Medium (XML)</td>      <td>High (RDF)</td>   </tr>    <tr>      <td><strong>Defining Services</strong></td>      <td>Search </td>      <td>Community (Blogs to Social Networks) </td>      <td>Find</td>   </tr>    <tr>      <td><strong>Participation Quotient</strong></td>      <td>Low</td>      <td>Medium</td>      <td>High</td>   </tr>    <tr>      <td><strong>Serendipitous Discovery Quotient </strong></td>      <td>Low</td>      <td>Medium</td>      <td>High</td>   </tr>    <tr>      <td><strong>Data Referencability Quotient </strong></td>      <td>Low (Documents)</td>      <td>Medium (Documents)</td>      <td>High (Documents and their constituent Data)</td>   </tr>    <tr>      <td><strong>Subjectivity Quotient</strong></td>      <td>High</td>      
<td>Medium (from A-list bloggers to select source and partner lists)</td>      <td>Low (everything is discovered via URIs)</td>   </tr>    <tr>      <td>    <strong><a href="http://dbpedia.org/resource/Transclusion" id="link-id155308d8">Transclusence</a>    </strong></td>      <td>Low</td>      <td>Medium (Code driven Mashups)</td>      <td>High (Data driven Meshups)</td>   </tr>    <tr>      <td><strong>What You See Is What You Prefer (WYSIWYP)</strong></td>      <td>Low</td>      <td>Medium </td>      <td>High (negotiated representation of resource descriptions)</td>   </tr>    <tr>      <td><strong>Open Data Access (Data Accessibility)</strong></td>      <td>Low</td>      <td>Medium (Silos)</td>      <td>High (no Silos)</td>   </tr>    <tr>      <td><strong>Identity Issues Handling</strong></td>      <td>Low</td>      <td>Medium (<a href="http://dbpedia.org/resource/OpenID" id="link-id119d77f8">OpenID</a>)</td>      <td><p>High (<a href="http://esw.w3.org/topic/foaf+ssl" id="link-id135cc348">FOAF+SSL</a>)</p></td>   </tr>    <tr>      <td><strong>Solution Deployment Model</strong></td>      <td>Centralized</td>      <td>Centralized with sprinklings of Federation</td>      <td>Federated with function specific Centralization (e.g. 
Lookup hubs like <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id1496d1d0">LOD</a> Cloud or <a href="http://dbpedia.org/resource/DBpedia" id="link-id1571f690">DBpedia</a>)</td>   </tr>   <tr>     <td><strong>Data Model Orientation</strong></td>     <td>Logical (Tree based DOM)</td>     <td>Logical (Tree based XML)</td>     <td>Conceptual (Graph based RDF)</td>   </tr>   <tr>     <td><strong>User Interface Issues</strong></td>     <td>Dynamically generated static interfaces</td>     <td>Dynamically generated interfaces with semi-dynamic interfaces (courtesy of XSLT or <a href="http://dbpedia.org/resource/XQuery" id="link-id118399e8">XQuery</a>/<a href="http://dbpedia.org/resource/XPath" id="link-id14b00ba0">XPath</a>)</td>     <td>Dynamic Interfaces (pre- and post-generation) courtesy of self-describing nature of RDF</td>   </tr>   <tr>     <td><strong>Data Querying</strong></td>     <td><a href="http://dbpedia.org/resource/Full_text_search" id="link-id14fdd948">Full Text Search</a></td>     <td>Full Text Search</td>     <td>Full Text Search + Structured Graph Pattern Query Language (<a href="http://dbpedia.org/resource/SPARQL" id="link-id154a9368">SPARQL</a>)</td>   </tr>   <tr>     <td><strong>What Each Delivers</strong></td>     <td>Democratized Publishing</td>     <td>Democratized Journalism &amp; Commentary (Citizen Journalists &amp; Commentators)</td>     <td>Democratized Analysis (Citizen Data Analysts)</td>   </tr>     <tr>     <td>    <strong><a href="http://dbpedia.org/resource/Star_Wars" id="link-id155ce920">Star Wars Edition Analogy</a>    </strong></td>     <td>Star Wars (original fight for decentralization via rebellion)</td>     <td>Empire Strikes Back (centralization and data silos make comeback)</td>     <td>Return of the JEDI (<a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/weblog/kidehen@openlinksw.com%27s%20BLOG%20%5B127%5D/1474" id="link-id11706640">FORCE</a> emerges and facilitates 
decentralization from &quot;Identity&quot; all the way to &quot;Open Data Access&quot; and &quot;Negotiable Descriptive Data Representation&quot;)</td>   </tr> </tbody>
</table>  <p>Naturally, I am not expecting everyone to agree with <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-id15be20c0">me</a>. I am simply making my contribution to what will remain fascinating discourse for a long time to come :-)</p>  <h3>Related</h3>  <ul>    <li>    <a href="http://novaspivack.typepad.com/nova_spivacks_weblog/2007/10/web-30----the-a.html" id="link-id14a9d738">Web 3.0 The Best Official Definition Imaginable</a> -- Nova Spivack&#39;s </li>  </ul>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-01-27#1520">
  <rss:title>Time for RDBMS Primacy Downgrade is Nigh! (No Embedded Images Edition - Update 1)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-01-27T19:19:44Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">As the world works its way through a &quot;once in a generation&quot; economic crisis, the long overdue downgrade of the RDBMS, from its pivotal position at the apex of the data access and data management pyramid is nigh. What is the Data Access, and Data Management Value Pyramid? As depicted below, a top-down view of the data access and data management value chain. The term: apex, simply indicates value primacy, which takes the form of a data access API based entry point into a DBMS realm -- aligned to an underlying data model. Examples of data access APIs include: Native Call Level Interfaces (CLIs), ODBC, JDBC, ADO.NET, OLE-DB, XMLA, and Web Services. See: AVF Pyramid Diagram. The degree to which ad-hoc views of data managed by a DBMS can be produced and dispatched to relevant data consumers (e.g. people), without compromising concurrency, data durability, and security, collectively determine the &quot;Agility Value Factor&quot; (AVF) of a given DBMS. Remember, agility as the cornerstone of environmental adaptation is as old as the concept of evolution, and intrinsic to all pursuits of primacy. In simpler business oriented terms, look at AVF as the degree to which DBMS technology affects the ability to effectively implement &quot;Market Leadership Discipline&quot; along the following pathways: innovation, operation excellence, or customer intimacy. Why has RDBMS Primacy Endured? Historically, at least since the late &#39;80s, the RDBMS genre of DBMS has consistently offered the highest AVF relative to other DBMS genres en route to primacy within the value pyramid. The desire to improve on paper reports and spreadsheets is basically what DBMS technology has fundamentally addressed to date, even though conceptual level interaction with data has never been its forte. See: RDBMS Primacy Diagram. 
For more then 10 years -- at the very least -- limitations of the traditional RDBMS in the realm of conceptual level interaction with data across diverse data sources and schemas (enterprise, Web, and Internet) has been crystal clear to many RDBMS technology practitioners, as indicated by some of the quotes excerpted below: &quot;Future of Database Research is excellent, but what is the future of data?&quot; &quot;..it is hard for me to disagree with the conclusions in this report. It captures exactly the right thoughts, and should be a must read for everyone involved in the area of databases and database research in particular.&quot; -- Dr. Anant Jingran, CTO, IBM Information Management Systems, commenting on the 2007 RDBMS technology retreat attended by a number of key DBMS technology pioneers and researchers. &quot;One size fits all: A concept whose time has come and gone They are direct descendants of System R and Ingres and were architected more than 25 years ago They are advocating &quot;one size fits all&quot;; i.e. a single engine that solves all DBMS needs. -- Prof. Michael Stonebreaker, one of the founding fathers of the RDBMS industry. Until this point in time, the requisite confluence of &quot;circumstantial pain&quot; and &quot;open standards&quot; based technology required to enable an objective &quot;compare and contrast&quot; of RDBMS engine virtues and viable alternatives hasn&#39;t occurred. Thus, the RDBMS has endured it position of primacy albeit on a &quot;one size fits all basis&quot;. Circumstantial Pain As mentioned earlier, we are in the midst of an economic crisis that is ultimately about a consistent inability to connect dots across a substrate of interlinked data sources that transcend traditional data access boundaries with high doses of schematic heterogeneity. 
Ironically, in a era of the dot-com, we haven&#39;t been able to make meaningful connections between relevant &quot;real-world things&quot; that extend beyond primitive data hosted database tables and content management style document containers; we&#39;ve struggled to achieve this in the most basic sense, let alone evolve our ability to connect inline with the exponential rate at which the Internet &amp; Web are spawning &quot;universes of discourse&quot; (data spaces) that emanate from user activity (within the enterprise and across the Internet &amp; Web). In a nutshell, we haven&#39;t been able to upgrade our interaction with data such that &quot;conceptual models&quot; and resulting &quot;context lenses&quot; (or facets) become concrete; by this I mean: real-world entity interaction making its way into the computer realm as opposed to the impedance we all suffer today when we transition from conceptual model interaction (real-world) to logical model interaction (when dealing with RDBMS based data access and data management). Here are some simple examples of what I can only best describe as: &quot;critical dots unconnected&quot;, resulting from an inability to interact with data conceptually: Government (Globally) - Financial regulatory bodies couldn&#39;t effectively discern that a Credit Default Swap is an Insurance policy in all but literal name. And in not doing so the cost of an unregulated insurance policy laid the foundation for exacerbating the toxicity of fatally flawed mortgage backed securities. Put simply: a flawed insurance policy was the fallback on a toxic security that financiers found exotic based on superficial packaging. Enterprises - Banks still don&#39;t understand that capital really does exists in tangible and intangible forms; with the intangible being the variant that is inherently dynamic. 
For example, a tech companies intellectual capital far exceeds the value of fixture, fittings, and buildings, but you be amazed to find that in most cases this vital asset has not significant value when banks get down to the nitty gritty of debt collateral; instead, a buffer of flawed securitization has occurred atop a borderline static asset class covering the aforementioned buildings, fixtures, and fittings. In the general enterprise arena, IT executives continued to &quot;rip and replace&quot; existing technology without ever effectively addressing the timeless inability to connect data across disparate data silos generated by internal enterprise applications, let alone the broader need to mesh data from the inside with external data sources. No correlations made between the growth of buzzwords and the compounding nature of data integration challenges. It&#39;s 2009 and only a miniscule number of executives dare fantasize about being anywhere within distance of the: relevant information at your fingertips vision. Looking more holistically at data interaction in general, whether you interact with data in the enterprise space (i.e., at work) or on the Internet or Web, you ultimately are delving into a mishmash of disparate computer systems, applications, service (Web or SOA), and databases (of the RDBMS variety in a majority of cases) associated with a plethora of disparate schemas. Yes, but even today &quot;rip and replace&quot; is still the norm pushed by most vendors; pitting one mono culture against another as exemplified by irrelevances such as: FOSS/LAMP vs Commercial or Web vs. Enterprise, when none of this matters if the data access and integration issues are recognized let alone addressed (see: Applications are Like Fish and Data Like Wine). 
Like the current credit-crunch, exponential growth of data originating from disparate application databases and associated schemas, within shrinking processing time frames, has triggered a rethinking of what defines data access and data management value today en route to an inevitable RDBMS downgrade within the value pyramid. Technology There have been many attempts to address real-world modeling requirements across the broader DBMS community from Object Databases to Object-Relational Databases, and more recently the emergence of simple Entity-Attribute-Value model DBMS engines. In all cases failure has come down to the existence of one or more of the following deficiencies, across each potential alternative: Query language standardization - nothing close to SQL standardization Data Access API standardization - nothing close to ODBC, JDBC, OLE-DB, or ADO.NET Wire protocol standardization - nothing close to HTTP Distributed Identity infrastructure - nothing close to the non-repudiatable digital Identity that foaf+ssl accords Use of Identifiers as network based pointers to data sources - nothing close to RDF based Linked Data Negotiable data representation - nothing close to Mime and HTTP based Content Negotiation Scalability especially in the era of Internet &amp; Web scale. Entity-Attribute-Value with Classes &amp; Relationships (EAV/CR) data models A common characteristic shared by all post-relational DBMS management systems (from Object Relational to pure Object) is an orientation towards variations of EAV/CR based data models. Unfortunately, all efforts in the EAV/CR realm have typically suffered from at least one of the deficiencies listed above. In addition, the same &quot;one DBMS model fits all&quot; approach that lies at the heart of the RDBMS downgrade also exists in the EAV/CR realm. What Comes Next? 
The RDBMS is not going away (ever), but its era of primacy -- by virtue of its placement at the apex of the data access and data management value pyramid -- is over! I make this bold claim for the following reasons: The Internet aided &quot;Global Village&quot; has brought &quot;Open World&quot; vs &quot;Closed World&quot; assumption issues to the fore e.g., the current global economic crisis remains centered on the inability to connect dots across &quot;Open World&quot; and &quot;Closed World&quot; data frontiers Entity-Attribute-Value with Classes &amp; Relationships (EAV/CR) based DBMS models are more effective when dealing with disparate data associated with disparate schemas, across disparate DBMS engines, host operating systems, and networks. Based on the above, it is crystal clear that a different kind of DBMS -- one with higher AVF relative to the RDBMS -- needs to sit atop today&#39;s data access and data management value pyramid. The characteristics of this DBMS must include the following: Every item of data (Datum/Entity/Object/Resource) has Identity Identity is achieved via Identifiers that aren&#39;t locked at the DBMS, OS, Network, or Application levels Object Identifiers and Object values are independent (extricably linked by association) Object values should be de-referencable via Object Identifier Representation of de-referenced value graph (entity, attributes, and values mesh) must be negotiable (i.e. content negotiation) Structured query language must provide mechanism for Creation, Deletion, Updates, and Querying of data objects Performance &amp; Scalability across &quot;Closed World&quot; (enterprise) and &quot;Open World&quot; (Internet &amp; Web) realms. Quick recap, I am not saying that RDBMS engine technology is dead or obsolete. I am simply stating that the era of RDBMS primacy within the data access and data management value pyramid is over. 
The problem domain (conceptual model views over heterogeneous data sources) at the apex of the aforementioned pyramid has simply evolved beyond the natural capabilities of the RDBMS which is rooted in &quot;Closed World&quot; assumptions re., data definition, access, and management. The need to maintain domain based conceptual interaction with data is now palpable at every echelon within our &quot;Global Village&quot; - Internet, Web, Enterprise, Government etc. It is my personal view that an EAV/CR model based DBMS, with support for the seven items enumerated above, can trigger the long anticipated RDBMS downgrade. Such a DBMS would be inherently multi-model because you would need to the best of RDBMS and EAV/CR model engines in a single product, with in-built support for HTTP and other Internet protocols in order to effectively address data representation and serialization issues. EAV/CR Oriented Data Access &amp; Management Technology Examples of contemporary EAV/CR frameworks that provide concrete conceptual layers for data access and data management currently include: Resource Description Framework (RDF) - an EAV/CR based framework RDF Linked Data - EAV/CR based framework that mandates de-referencable HTTP based Identifiers ADO.NET Entity Frameworks - Microsoft .NET based EAV/CR framework Core Data Services - Mac OS X based EAV/CR framework that evolved from NeXT&#39;s Enterprise Object Frameworks (EOF). The frameworks above provide the basis for a revised AVF pyramid, as depicted below, that reflects today&#39;s data access and management realities i.e., an Internet &amp; Web driven global village comprised of interlinked distributed data objects, compatible with &quot;Open World&quot; assumptions. See: New EAV/CR Primacy Diagram. Related How &amp; Why Glue is Using Amazon SimpleDB Object Database Manifesto (Identity excerpt) Database Models Overview Ted Nelson Explaining Irregularity and Idiosyncrasy of Data Structures - ZigZag Demo</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p> As the world works it way through a &quot;once in a generation&quot; economic crisis, the long overdue downgrade of the <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id15750540">RDBMS</a>, from its pivotal position at the apex of the <a href="http://dbpedia.org/resource/Data" id="link-id0x24ea3650">data</a> access and data management pyramid is nigh.</p> <h3>What is the Data Access, and Data Management Value Pyramid?</h3> <p> As depicted below, a top-down view of the data access and data management value chain. The term: apex, simply indicates value primacy, which takes the form of a data access API based entry point into a DBMS realm -- aligned to an underlying data model. Examples of data access APIs include: Native Call Level Interfaces (CLIs), <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id11c254c0">ODBC</a>, <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id149b16a8">JDBC</a>, <a href="http://dbpedia.org/resource/ADO.NET" id="link-id11451eb0">ADO</a>.NET, <a href="http://dbpedia.org/resource/OLE_DB" id="link-id15b02478">OLE-DB</a>, <a href="http://dbpedia.org/resource/XML_for_Analysis" id="link-id1181fa10">XMLA</a>, and <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id0x1f8394a8">Web</a> Services.</p> See: <a href="http://virtuoso.openlinksw.com/images/Agility_Value_Factors_Pyramid.png" id="link-id146cadd8"> AVF Pyramid Diagram.</a> <p> The degree to which ad-hoc views of data managed by a DBMS can be produced and dispatched to relevant data consumers (e.g. people), without compromising concurrency, data durability, and security, collectively determine the &quot;Agility Value Factor&quot; (AVF) of a given DBMS. Remember, agility as the cornerstone of environmental adaptation is as old as the concept of evolution, and intrinsic to all pursuits of primacy. 
</p> <p>In simpler business oriented terms, look at AVF as the degree to which DBMS technology affects the ability to effectively implement &quot;Market Leadership Discipline&quot; along the following pathways: innovation, operation excellence, or customer intimacy. </p> <h3>Why has RDBMS Primacy Endured?</h3> <p> Historically, at least since the late &#39;80s, the RDBMS genre of DBMS has consistently offered the highest AVF relative to other DBMS genres en route to primacy within the value pyramid. The desire to improve on paper reports and spreadsheets is basically what DBMS technology has fundamentally addressed to date, even though conceptual level interaction with data has never been its forte.</p> See: <a href="http://virtuoso.openlinksw.com/images/Old_RDBMS_Primacy_Pyramid.png" id="link-id134dab90"> RDBMS Primacy Diagram.</a> <p> For more than 10 years -- at the very least -- limitations of the traditional RDBMS in the realm of conceptual level interaction with data across diverse data sources and schemas (enterprise, Web, and <a href="http://dbpedia.org/resource/Internet" id="link-id116001c0">Internet</a>) have been crystal clear to many RDBMS technology practitioners, as indicated by some of the quotes excerpted below:</p> <blockquote> <cite> <p> &quot;Future of Database Research is excellent, but what is the future of data?&quot; </p> &quot;..it is hard for <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-id14932398">me</a> to disagree with the conclusions in this report. It captures exactly the right thoughts, and should be a must read for everyone involved in the area of databases and database research in particular.&quot; <p>-- <a href="http://jhingran.typepad.com/anant_jhingrans_musings/" id="link-id11334c50">Dr. 
Anant Jhingran</a>, CTO, IBM <a href="http://dbpedia.org/resource/Information" id="link-id150c7970">Information</a> Management Systems, commenting on the <a href="http://db.cs.berkeley.edu/claremont/" id="link-id11c3b408">2007 RDBMS technology retreat</a> attended by a number of key DBMS technology pioneers and researchers.</p> </cite> </blockquote> <blockquote> <cite> <p> &quot;<a href="http://www.databasecolumn.com/2007/09/one-size-fits-all.html" id="link-id15c14f08">One size fits all: A concept whose time has come and gone</a> </p> <p> </p> <ol> <li> They are direct descendants of System R and <a href="http://dbpedia.org/resource/Ingres" id="link-id146da780">Ingres</a> and were architected more than 25 years ago</li> <li> They are advocating &quot;one size fits all&quot;; i.e. a single engine that solves all DBMS needs. </li> </ol> <p>-- Prof. <a href="http://en.wikipedia.org/wiki/Michael_Stonebraker" id="link-id145c4e28">Michael Stonebraker</a>, one of the founding fathers of the RDBMS industry.</p> </cite> </blockquote> <p>Until this point in time, the requisite confluence of &quot;circumstantial pain&quot; and &quot;open standards&quot; based technology required to enable an objective &quot;compare and contrast&quot; of RDBMS engine virtues and viable alternatives hasn&#39;t occurred. Thus, the RDBMS has endured in its position of primacy albeit on a &quot;one size fits all basis&quot;. </p> <h4>Circumstantial Pain</h4> <p> As mentioned earlier, we are in the midst of an economic crisis that is ultimately about a consistent inability to connect dots across a substrate of interlinked data sources that transcend traditional data access boundaries with high doses of schematic heterogeneity. 
Ironically, in a era of the dot-com, we haven&#39;t been able to make meaningful connections between relevant &quot;real-world things&quot; that extend beyond primitive data hosted database tables and content management style document containers; we&#39;ve struggled to achieve this in the most basic sense, let alone evolve our ability to connect inline with the <a href="http://www.vldb2007.org/program/slides/s1161-brodie.pdf" id="link-id11a0dcf0">exponential rate at which the Internet &amp; Web are spawning &quot;universes of discourse&quot; (data spaces) that emanate from user activity</a> (within the enterprise and across the Internet &amp; Web). In a nutshell, we haven&#39;t been able to upgrade our interaction with data such that &quot;conceptual models&quot; and resulting &quot;<a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id12da4b00">context</a> lenses&quot; (or facets) become concrete; by this I mean: real-world <a href="http://dbpedia.org/resource/Entity" id="link-id146a48a8">entity</a> interaction making its way into the computer realm as opposed to the impedance we all suffer today when we transition from conceptual model interaction (real-world) to logical model interaction (when dealing with RDBMS based data access and data management). </p> <p>Here are some simple examples of what I can only best describe as: &quot;critical dots unconnected&quot;, resulting from an inability to interact with data conceptually:</p> <strong>Government (Globally) -</strong> <p> Financial regulatory bodies couldn&#39;t effectively discern that a <a href="http://dbpedia.org/resource/Credit_default_swap" id="link-id115ba0e0">Credit Default Swap</a> is an Insurance policy in all but literal name. And in not doing so the cost of an unregulated <a href="http://dbpedia.org/resource/Insurance" id="link-id158d4960">insurance policy</a> laid the foundation for exacerbating the toxicity of fatally flawed mortgage backed securities. 
Put simply: a flawed insurance policy was the fallback on a toxic security that financiers found exotic based on superficial packaging.</p> <strong>Enterprises - </strong> <p> Banks still don&#39;t understand that capital really does exist in tangible and intangible forms; with the intangible being the variant that is inherently dynamic. For example, a tech company&#39;s intellectual capital far exceeds the value of fixtures, fittings, and buildings, but you&#39;d be amazed to find that in most cases this vital asset has no significant value when banks get down to the nitty gritty of debt collateral; instead, a buffer of flawed securitization has occurred atop a borderline static asset class covering the aforementioned buildings, fixtures, and fittings. </p> <p> In the general enterprise arena, IT executives continued to &quot;rip and replace&quot; existing technology without ever effectively addressing the timeless inability to connect data across disparate data silos generated by internal enterprise applications, let alone the broader need to mesh data from the inside with external data sources. No correlations made between the growth of buzzwords and the compounding nature of data integration challenges. It&#39;s 2009 and only a minuscule number of executives dare fantasize about being anywhere within distance of the: relevant information at your fingertips vision. </p> <p> Looking more holistically at data interaction in general, whether you interact with data in the enterprise space (i.e., at work) or on the Internet or Web, you ultimately are delving into a mishmash of disparate computer systems, applications, services (Web or SOA), and databases (of the RDBMS variety in a majority of cases) associated with a plethora of disparate schemas. Yes, but even today &quot;rip and replace&quot; is still the norm pushed by most vendors; pitting one mono culture against another as exemplified by irrelevances such as: FOSS/LAMP vs Commercial or Web vs. 
Enterprise, when none of this matters if the data access and integration issues are recognized let alone addressed (see: <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/weblog/kidehen@openlinksw.com%27s%20BLOG%20%5B127%5D/1497?sid=0df0294caee8b37925c6a888bbbca136&amp;realm=wa" id="link-id15c27300">Applications are Like Fish and Data Like Wine</a>). </p> <p> Like the current credit-crunch, exponential growth of data originating from disparate application databases and associated schemas, within shrinking processing time frames, has triggered a rethinking of what defines data access and data management value today en route to an inevitable RDBMS downgrade within the value pyramid.</p> <h3>Technology</h3> <p>There have been many attempts to address real-world modeling requirements across the broader DBMS community from Object Databases to Object-Relational Databases, and more recently the emergence of simple <a href="http://dbpedia.org/resource/Entity-attribute-value_model" id="link-id1128dad0">Entity</a>-Attribute-Value model DBMS engines. 
In all cases failure has come down to the existence of one or more of the following deficiencies, across each potential alternative:</p> <ol> <li>Query language standardization - nothing close to <a href="http://dbpedia.org/resource/SQL" id="link-id16002d60">SQL</a> standardization</li> <li>Data Access API standardization - nothing close to ODBC, JDBC, OLE-DB, or ADO.NET</li> <li>Wire protocol standardization - nothing close to HTTP</li> <li>Distributed Identity infrastructure - nothing close to the non-repudiatable digital Identity that <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id14926b18">foaf</a>+ssl accords</li> <li>Use of Identifiers as network based pointers to data sources - nothing close to RDF based <a href="http://dbpedia.org/resource/Linked_Data" id="link-id16180a28">Linked Data</a> </li> <li>Negotiable data representation - nothing close to Mime and HTTP based Content Negotiation</li> <li>Scalability especially in the era of Internet &amp; Web scale.</li> </ol> <h4>Entity-Attribute-Value with Classes &amp; Relationships (<a href="http://dbpedia.org/resource/Entity-attribute-value_model" id="link-id13e741b8">EAV</a>/CR) data models</h4> <p>A common characteristic shared by all post-relational DBMS management systems (from Object Relational to pure Object) is an orientation towards variations of EAV/CR based data models. Unfortunately, all efforts in the EAV/CR realm have typically suffered from at least one of the deficiencies listed above. In addition, the same &quot;one DBMS model fits all&quot; approach that lies at the heart of the RDBMS downgrade also exists in the EAV/CR realm.</p> <h3>What Comes Next?</h3> <p>The RDBMS is not going away (ever), but its era of primacy -- by virtue of its placement at the apex of the data access and data management value pyramid -- is over! 
I make this bold claim for the following reasons: </p> <ol> <li> The Internet aided &quot;Global Village&quot; has brought &quot;<a href="http://en.wikipedia.org/wiki/Open_World_Assumption" id="link-id1148e560">Open World</a>&quot; vs &quot;<a href="http://en.wikipedia.org/wiki/Closed_World_Assumption" id="link-id11967cd0">Closed World</a>&quot; assumption issues to the fore e.g., the current global economic crisis remains centered on the inability to connect dots across &quot;Open World&quot; and &quot;Closed World&quot; data frontiers </li> <li> Entity-Attribute-Value with Classes &amp; Relationships (EAV/CR) based DBMS models are more effective when dealing with disparate data associated with disparate schemas, across disparate DBMS engines, host operating systems, and networks. </li> </ol> <p>Based on the above, it is crystal clear that a different kind of DBMS -- one with higher AVF relative to the RDBMS -- needs to sit atop today&#39;s data access and data management value pyramid. The characteristics of this DBMS must include the following:</p> <ol> <li> Every item of data (Datum/Entity/Object/Resource) has Identity</li> <li> Identity is achieved via Identifiers that aren&#39;t locked at the DBMS, OS, Network, or Application levels</li> <li> Object Identifiers and Object values are independent (extricably linked by association)</li> <li> Object values should be de-referencable via Object Identifier</li> <li> Representation of de-referenced value graph (entity, attributes, and values mesh) must be negotiable (i.e. content negotiation)</li> <li>Structured query language must provide mechanism for Creation, Deletion, Updates, and Querying of data objects</li> <li> Performance &amp; Scalability across &quot;Closed World&quot; (enterprise) and &quot;Open World&quot; (Internet &amp; Web) realms.</li> </ol> <p>Quick recap, I am not saying that RDBMS engine technology is dead or obsolete. 
I am simply stating that the era of RDBMS primacy within the data access and data management value pyramid is over. </p> <p>The problem domain (conceptual model views over heterogeneous data sources) at the apex of the aforementioned pyramid has simply evolved beyond the natural capabilities of the RDBMS which is rooted in &quot;Closed World&quot; assumptions re., data definition, access, and management. The need to maintain domain based conceptual interaction with data is now palpable at every echelon within our &quot;Global Village&quot; - Internet, Web, Enterprise, Government etc.</p> <p>It is my personal view that an EAV/CR model based DBMS, with support for the seven items enumerated above, can trigger the long anticipated RDBMS downgrade. Such a DBMS would be inherently multi-model because you would need the best of RDBMS and EAV/CR model engines in a single product, with in-built support for HTTP and other Internet protocols in order to effectively address data representation and serialization issues.</p> <h4>EAV/CR Oriented Data Access &amp; Management Technology</h4> <p>Examples of contemporary EAV/CR frameworks that provide concrete conceptual layers for data access and data management currently include:</p> <ul> <li> <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id115d1cb0"> Resource Description Framework</a> (RDF) - an EAV/CR based framework</li> <li> <a href="http://dbpedia.org/resource/Linked_Data" id="link-id116cf810">RDF Linked Data </a>- EAV/CR based framework that mandates de-referencable HTTP based Identifiers</li> <li> <a href="http://dbpedia.org/resource/ADO.NET_Entity_Framework" id="link-id13daa160">ADO.NET Entity Frameworks</a> - Microsoft .NET based EAV/CR framework</li> <li> <a href="http://dbpedia.org/page/Core_Data" id="link-id11111838">Core Data Services </a>- Mac OS X based EAV/CR framework that evolved from NeXT&#39;s <a href="http://dbpedia.org/resource/Enterprise_Objects_Framework" 
id="link-id15c27df0">Enterprise Object Frameworks</a> (EOF).</li> </ul> <p>The frameworks above provide the basis for a revised AVF pyramid, as depicted below, that reflects today&#39;s data access and management realities i.e., an Internet &amp; Web driven global village comprised of interlinked distributed data objects, compatible with &quot;Open World&quot; assumptions.</p> See: <a href="http://virtuoso.openlinksw.com/images/New_EAV_RDBMS_Pyramid.png" id="link-id158e0760">New EAV/CR Primacy Diagram.</a> <h3>Related</h3> <ul> <li> <a href="http://dynamicorange.com/2009/01/22/blueblog-how-and-why-glue-is-using-amazon-simpledb-instead-of-a-relational-database/" id="link-id15e07c10">How &amp; Why Glue is Using Amazon SimpleDB</a> </li> <li> <a href="http://www.cs.cmu.edu/afs/cs.cmu.edu/user/clamen/OODBMS/Manifesto/htManifesto/node4.html#SECTION00022000000000000000" id="link-id116cf450">Object Database Manifesto (Identity excerpt)</a> </li> <li> <a href="http://www.unixspace.com/context/databases.html" id="link-id150b2c20">Database Models Overview</a> </li>
<li>
  <a href="http://www.youtube.com/watch?v=WEj9vqVvHPc&amp;feature=related" id="link-id0x1135d978">Ted Nelson Explaining Irregularity and Idiosyncrasy of Data Structures</a> - ZigZag Demo </li> </ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-01-24#1519">
  <rss:title>The Time for RDBMS Primacy Downgrade is Nigh!</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-01-25T00:04:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">As the world works its way through a &quot;once in a generation&quot; economic crisis, the long overdue downgrade of the RDBMS, from its pivotal position at the apex of the data access and data management pyramid is nigh. What is the Data Access, and Data Management Value Pyramid? As depicted below, a top-down view of the data access and data management value chain. The term: apex, simply indicates value primacy, which takes the form of a data access API based entry point into a DBMS realm -- aligned to an underlying data model. Examples of data access APIs include: Native Call Level Interfaces (CLIs), ODBC, JDBC, ADO.NET, OLE-DB, XMLA, and Web Services. The degree to which ad-hoc views of data managed by a DBMS can be produced and dispatched to relevant data consumers (e.g. people), without compromising concurrency, data durability, and security, collectively determine the &quot;Agility Value Factor&quot; (AVF) of a given DBMS. Remember, agility as the cornerstone of environmental adaptation is as old as the concept of evolution, and intrinsic to all pursuits of primacy. In simpler business oriented terms, look at AVF as the degree to which DBMS technology affects the ability to effectively implement &quot;Market Leadership Discipline&quot; along the following pathways: innovation, operation excellence, or customer intimacy. Why has RDBMS Primacy Endured? Historically, at least since the late &#39;80s, the RDBMS genre of DBMS has consistently offered the highest AVF relative to other DBMS genres en route to primacy within the value pyramid. The desire to improve on paper reports and spreadsheets is basically what DBMS technology has fundamentally addressed to date, even though conceptual level interaction with data has never been its forte. 
For more than 10 years -- at the very least -- limitations of the traditional RDBMS in the realm of conceptual level interaction with data across diverse data sources and schemas (enterprise, Web, and Internet) have been crystal clear to many RDBMS technology practitioners, as indicated by some of the quotes excerpted below: &quot;Future of Database Research is excellent, but what is the future of data?&quot; &quot;..it is hard for me to disagree with the conclusions in this report. It captures exactly the right thoughts, and should be a must read for everyone involved in the area of databases and database research in particular.&quot; -- Dr. Anant Jhingran, CTO, IBM Information Management Systems, commenting on the 2007 RDBMS technology retreat attended by a number of key DBMS technology pioneers and researchers. &quot;One size fits all: A concept whose time has come and gone They are direct descendants of System R and Ingres and were architected more than 25 years ago They are advocating &quot;one size fits all&quot;; i.e. a single engine that solves all DBMS needs. -- Prof. Michael Stonebraker, one of the founding fathers of the RDBMS industry. Until this point in time, the requisite confluence of &quot;circumstantial pain&quot; and &quot;open standards&quot; based technology required to enable an objective &quot;compare and contrast&quot; of RDBMS engine virtues and viable alternatives hasn&#39;t occurred. Thus, the RDBMS has endured in its position of primacy albeit on a &quot;one size fits all basis&quot;. Circumstantial Pain As mentioned earlier, we are in the midst of an economic crisis that is ultimately about a consistent inability to connect dots across a substrate of interlinked data sources that transcend traditional data access boundaries with high doses of schematic heterogeneity. 
Ironically, in an era of the dot-com, we haven&#39;t been able to make meaningful connections between relevant &quot;real-world things&quot; that extend beyond primitive data hosted database tables and content management style document containers; we&#39;ve struggled to achieve this in the most basic sense, let alone evolve our ability to connect inline with the exponential rate at which the Internet &amp; Web are spawning &quot;universes of discourse&quot; (data spaces) that emanate from user activity (within the enterprise and across the Internet &amp; Web). In a nutshell, we haven&#39;t been able to upgrade our interaction with data such that &quot;conceptual models&quot; and resulting &quot;context lenses&quot; (or facets) become concrete; by this I mean: real-world entity interaction making its way into the computer realm as opposed to the impedance we all suffer today when we transition from conceptual model interaction (real-world) to logical model interaction (when dealing with RDBMS based data access and data management). Here are some simple examples of what I can only best describe as: &quot;critical dots unconnected&quot;, resulting from an inability to interact with data conceptually: Government (Globally) - Financial regulatory bodies couldn&#39;t effectively discern that a Credit Default Swap is an Insurance policy in all but literal name. And in not doing so the cost of an unregulated insurance policy laid the foundation for exacerbating the toxicity of fatally flawed mortgage backed securities. Put simply: a flawed insurance policy was the fallback on a toxic security that financiers found exotic based on superficial packaging. Enterprises - Banks still don&#39;t understand that capital really does exist in tangible and intangible forms; with the intangible being the variant that is inherently dynamic. 
For example, a tech company&#39;s intellectual capital far exceeds the value of fixtures, fittings, and buildings, but you&#39;d be amazed to find that in most cases this vital asset has no significant value when banks get down to the nitty gritty of debt collateral; instead, a buffer of flawed securitization has occurred atop a borderline static asset class covering the aforementioned buildings, fixtures, and fittings. In the general enterprise arena, IT executives continued to &quot;rip and replace&quot; existing technology without ever effectively addressing the timeless inability to connect data across disparate data silos generated by internal enterprise applications, let alone the broader need to mesh data from the inside with external data sources. No correlations made between the growth of buzzwords and the compounding nature of data integration challenges. It&#39;s 2009 and only a minuscule number of executives dare fantasize about being anywhere within distance of the: relevant information at your fingertips vision. Looking more holistically at data interaction in general, whether you interact with data in the enterprise space (i.e., at work) or on the Internet or Web, you ultimately are delving into a mishmash of disparate computer systems, applications, services (Web or SOA), and databases (of the RDBMS variety in a majority of cases) associated with a plethora of disparate schemas. Yes, but even today &quot;rip and replace&quot; is still the norm pushed by most vendors; pitting one mono culture against another as exemplified by irrelevances such as: FOSS/LAMP vs Commercial or Web vs. Enterprise, when none of this matters if the data access and integration issues aren&#39;t recognized, let alone addressed (see: Applications are Like Fish and Data Like Wine). 
Like the current credit-crunch, exponential growth of data originating from disparate application databases and associated schemas, within shrinking processing time frames, has triggered a rethinking of what defines data access and data management value today en route to an inevitable RDBMS downgrade within the value pyramid. Technology There have been many attempts to address real-world modeling requirements across the broader DBMS community from Object Databases to Object-Relational Databases, and more recently the emergence of simple Entity-Attribute-Value model DBMS engines. In all cases failure has come down to the existence of one or more of the following deficiencies, across each potential alternative: Query language standardization - nothing close to SQL standardization Data Access API standardization - nothing close to ODBC, JDBC, OLE-DB, or ADO.NET Wire protocol standardization - nothing close to HTTP Distributed Identity infrastructure - nothing close to the non-repudiatable digital Identity that foaf+ssl accords Use of Identifiers as network based pointers to data sources - nothing close to RDF based Linked Data Negotiable data representation - nothing close to Mime and HTTP based Content Negotiation Scalability especially in the era of Internet &amp; Web scale. Entity-Attribute-Value with Classes &amp; Relationships (EAV/CR) data models A common characteristic shared by all post-relational DBMS management systems (from Object Relational to pure Object) is an orientation towards variations of EAV/CR based data models. Unfortunately, all efforts in the EAV/CR realm have typically suffered from at least one of the deficiencies listed above. In addition, the same &quot;one DBMS model fits all&quot; approach that lies at the heart of the RDBMS downgrade also exists in the EAV/CR realm. What Comes Next? 
The RDBMS is not going away (ever), but its era of primacy -- by virtue of its placement at the apex of the data access and data management value pyramid -- is over! I make this bold claim for the following reasons: The Internet aided &quot;Global Village&quot; has brought &quot;Open World&quot; vs &quot;Closed World&quot; assumption issues to the fore e.g., the current global economic crisis remains centered on the inability to connect dots across &quot;Open World&quot; and &quot;Closed World&quot; data frontiers Entity-Attribute-Value with Classes &amp; Relationships (EAV/CR) based DBMS models are more effective when dealing with disparate data associated with disparate schemas, across disparate DBMS engines, host operating systems, and networks. Based on the above, it is crystal clear that a different kind of DBMS -- one with higher AVF relative to the RDBMS -- needs to sit atop today&#39;s data access and data management value pyramid. The characteristics of this DBMS must include the following: Every item of data (Datum/Entity/Object/Resource) has Identity Identity is achieved via Identifiers that aren&#39;t locked at the DBMS, OS, Network, or Application levels Object Identifiers and Object values are independent (extricably linked by association) Object values should be de-referencable via Object Identifier Representation of de-referenced value graph (entity, attributes, and values mesh) must be negotiable (i.e. content negotiation) Structured query language must provide mechanism for Creation, Deletion, Updates, and Querying of data objects Performance &amp; Scalability across &quot;Closed World&quot; (enterprise) and &quot;Open World&quot; (Internet &amp; Web) realms. Quick recap, I am not saying that RDBMS engine technology is dead or obsolete. I am simply stating that the era of RDBMS primacy within the data access and data management value pyramid is over. 
The problem domain (conceptual model views over heterogeneous data sources) at the apex of the aforementioned pyramid has simply evolved beyond the natural capabilities of the RDBMS which is rooted in &quot;Closed World&quot; assumptions re., data definition, access, and management. The need to maintain domain based conceptual interaction with data is now palpable at every echelon within our &quot;Global Village&quot; - Internet, Web, Enterprise, Government etc. It is my personal view that an EAV/CR model based DBMS, with support for the seven items enumerated above, can trigger the long anticipated RDBMS downgrade. Such a DBMS would be inherently multi-model because you would need the best of RDBMS and EAV/CR model engines in a single product, with in-built support for HTTP and other Internet protocols in order to effectively address data representation and serialization issues. EAV/CR Oriented Data Access &amp; Management Technology Examples of contemporary EAV/CR frameworks that provide concrete conceptual layers for data access and data management currently include: Resource Description Framework (RDF) - an EAV/CR based framework RDF Linked Data - EAV/CR based framework that mandates de-referencable HTTP based Identifiers ADO.NET Entity Frameworks - Microsoft .NET based EAV/CR framework Core Data Services - Mac OS X based EAV/CR framework that evolved from NeXT&#39;s Enterprise Object Frameworks (EOF). The frameworks above provide the basis for a revised AVF pyramid, as depicted below, that reflects today&#39;s data access and management realities i.e., an Internet &amp; Web driven global village comprised of interlinked distributed data objects, compatible with &quot;Open World&quot; assumptions. 
Related The Semantic Way - Alan Cho&#39;s Summary of PwC 2009 tech forecast report on the Semantic Web Is the RDBMS Doomed - ReadWriteWeb Article Anti-RDBMS: a list of Distributed Key-Value Stores - by Richard Jones (CTO Last.FM) How &amp; Why Glue is Using Amazon SimpleDB Object Database Manifesto (Identity excerpt) Database Models Overview Ted Nelson Explaining Irregularity and Idiosyncrasy of Data Structures - ZigZag Demo</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p> As the world works it way through a &quot;once in a generation&quot; economic crisis, the long overdue downgrade of the <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id15750540">RDBMS</a>, from its pivotal position at the apex of the <a href="http://dbpedia.org/resource/Data" id="link-id0x66a74b8">data</a> access and data management pyramid is nigh.</p> <h3>What is the Data Access, and Data Management Value Pyramid?</h3> <p> As depicted below, a top-down view of the data access and data management value chain. The term: apex, simply indicates value primacy, which takes the form of a data access API based entry point into a DBMS realm -- aligned to an underlying data model. Examples of data access APIs include: Native Call Level Interfaces (CLIs), <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id11c254c0">ODBC</a>, <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id149b16a8">JDBC</a>, <a href="http://dbpedia.org/resource/ADO.NET" id="link-id11451eb0">ADO</a>.NET, <a href="http://dbpedia.org/resource/OLE_DB" id="link-id15b02478">OLE-DB</a>, <a href="http://dbpedia.org/resource/XML_for_Analysis" id="link-id1181fa10">XMLA</a>, and <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id0x2fef498">Web</a> Services.</p> <div> <img alt="Image" src="http://virtuoso.openlinksw.com/images/Agility_Value_Factors_Pyramid.png" /> </div> <p> The degree to which ad-hoc views of data managed by a DBMS can be produced and dispatched to relevant data consumers (e.g. people), without compromising concurrency, data durability, and security, collectively determine the &quot;Agility Value Factor&quot; (AVF) of a given DBMS. Remember, agility as the cornerstone of environmental adaptation is as old as the concept of evolution, and intrinsic to all pursuits of primacy. 
</p> <p>In simpler business oriented terms, look at AVF as the degree to which DBMS technology affects the ability to effectively implement &quot;Market Leadership Discipline&quot; along the following pathways: innovation, operational excellence, or customer intimacy. </p> <h3>Why has RDBMS Primacy Endured?</h3> <p> Historically, at least since the late &#39;80s, the RDBMS genre of DBMS has consistently offered the highest AVF relative to other DBMS genres en route to primacy within the value pyramid. The desire to improve on paper reports and spreadsheets is basically what DBMS technology has fundamentally addressed to date, even though conceptual level interaction with data has never been its forte.</p> <div> <img alt="Image" src="http://virtuoso.openlinksw.com/images/Old_RDBMS_Primacy_Pyramid.png" /> </div> <p> For more than 10 years -- at the very least -- limitations of the traditional RDBMS in the realm of conceptual level interaction with data across diverse data sources and schemas (enterprise, Web, and <a href="http://dbpedia.org/resource/Internet" id="link-id116001c0">Internet</a>) have been crystal clear to many RDBMS technology practitioners, as indicated by some of the quotes excerpted below:</p> <blockquote> <cite> <p> &quot;Future of Database Research is excellent, but what is the future of data?&quot; </p> &quot;..it is hard for <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-id14932398">me</a> to disagree with the conclusions in this report. It captures exactly the right thoughts, and should be a must read for everyone involved in the area of databases and database research in particular.&quot; <p>-- <a href="http://jhingran.typepad.com/anant_jhingrans_musings/" id="link-id11334c50">Dr. 
Anant Jhingran</a>, CTO, IBM <a href="http://dbpedia.org/resource/Information" id="link-id150c7970">Information</a> Management Systems, commenting on the <a href="http://db.cs.berkeley.edu/claremont/" id="link-id11c3b408">2007 RDBMS technology retreat</a> attended by a number of key DBMS technology pioneers and researchers.</p> </cite> </blockquote> <blockquote> <cite> <p> &quot;<a href="http://www.databasecolumn.com/2007/09/one-size-fits-all.html" id="link-id15c14f08">One size fits all: A concept whose time has come and gone</a>&quot; </p> <p> </p> <ol> <li> They are direct descendants of System R and <a href="http://dbpedia.org/resource/Ingres" id="link-id146da780">Ingres</a> and were architected more than 25 years ago</li> <li> They are advocating &quot;one size fits all&quot;; i.e. a single engine that solves all DBMS needs. </li> </ol> <p>-- Prof. <a href="http://en.wikipedia.org/wiki/Michael_Stonebraker" id="link-id145c4e28">Michael Stonebraker</a>, one of the founding fathers of the RDBMS industry.</p> </cite> </blockquote> <p>Until this point in time, the requisite confluence of &quot;circumstantial pain&quot; and &quot;open standards&quot; based technology required to enable an objective &quot;compare and contrast&quot; of RDBMS engine virtues and viable alternatives hasn&#39;t occurred. Thus, the RDBMS has retained its position of primacy albeit on a &quot;one size fits all basis&quot;. </p> <h4>Circumstantial Pain</h4> <p> As mentioned earlier, we are in the midst of an economic crisis that is ultimately about a consistent inability to connect dots across a substrate of interlinked data sources that transcend traditional data access boundaries with high doses of schematic heterogeneity. 
Ironically, in a era of the dot-com, we haven&#39;t been able to make meaningful connections between relevant &quot;real-world things&quot; that extend beyond primitive data hosted database tables and content management style document containers; we&#39;ve struggled to achieve this in the most basic sense, let alone evolve our ability to connect inline with the <a href="http://www.vldb2007.org/program/slides/s1161-brodie.pdf" id="link-id11a0dcf0">exponential rate at which the Internet &amp; Web are spawning &quot;universes of discourse&quot; (data spaces) that emanate from user activity</a> (within the enterprise and across the Internet &amp; Web). In a nutshell, we haven&#39;t been able to upgrade our interaction with data such that &quot;conceptual models&quot; and resulting &quot;<a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id12da4b00">context</a> lenses&quot; (or facets) become concrete; by this I mean: real-world <a href="http://dbpedia.org/resource/Entity" id="link-id146a48a8">entity</a> interaction making its way into the computer realm as opposed to the impedance we all suffer today when we transition from conceptual model interaction (real-world) to logical model interaction (when dealing with RDBMS based data access and data management). </p> <p>Here are some simple examples of what I can only best describe as: &quot;critical dots unconnected&quot;, resulting from an inability to interact with data conceptually:</p> <strong>Government (Globally) -</strong> <p> Financial regulatory bodies couldn&#39;t effectively discern that a <a href="http://dbpedia.org/resource/Credit_default_swap" id="link-id115ba0e0">Credit Default Swap</a> is an Insurance policy in all but literal name. And in not doing so the cost of an unregulated <a href="http://dbpedia.org/resource/Insurance" id="link-id158d4960">insurance policy</a> laid the foundation for exacerbating the toxicity of fatally flawed mortgage backed securities. 
Put simply: a flawed insurance policy was the fallback on a toxic security that financiers found exotic based on superficial packaging.</p> <strong>Enterprises - </strong> <p> Banks still don&#39;t understand that capital really does exist in tangible and intangible forms; with the intangible being the variant that is inherently dynamic. For example, a tech company&#39;s intellectual capital far exceeds the value of fixtures, fittings, and buildings, but you&#39;d be amazed to find that in most cases this vital asset has no significant value when banks get down to the nitty gritty of debt collateral; instead, a buffer of flawed securitization has occurred atop a borderline static asset class covering the aforementioned buildings, fixtures, and fittings. </p> <p> In the general enterprise arena, IT executives continued to &quot;rip and replace&quot; existing technology without ever effectively addressing the timeless inability to connect data across disparate data silos generated by internal enterprise applications, let alone the broader need to mesh data from the inside with external data sources. No correlations were made between the growth of buzzwords and the compounding nature of data integration challenges. It&#39;s 2009 and only a minuscule number of executives dare fantasize about being anywhere within distance of the: relevant information at your fingertips vision. </p> <p> Looking more holistically at data interaction in general, whether you interact with data in the enterprise space (i.e., at work) or on the Internet or Web, you ultimately are delving into a mishmash of disparate computer systems, applications, services (Web or SOA), and databases (of the RDBMS variety in a majority of cases) associated with a plethora of disparate schemas. Yes, but even today &quot;rip and replace&quot; is still the norm pushed by most vendors; pitting one mono culture against another as exemplified by irrelevances such as: FOSS/LAMP vs Commercial or Web vs. 
Enterprise, when none of this matters if the data access and integration issues aren&#39;t recognized let alone addressed (see: <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/weblog/kidehen@openlinksw.com%27s%20BLOG%20%5B127%5D/1497?sid=0df0294caee8b37925c6a888bbbca136&amp;realm=wa" id="link-id15c27300">Applications are Like Fish and Data Like Wine</a>). </p> <p> Like the current credit-crunch, exponential growth of data originating from disparate application databases and associated schemas, within shrinking processing time frames, has triggered a rethinking of what defines data access and data management value today en route to an inevitable RDBMS downgrade within the value pyramid.</p> <h3>Technology</h3> <p>There have been many attempts to address real-world modeling requirements across the broader DBMS community from Object Databases to Object-Relational Databases, and more recently the emergence of simple <a href="http://dbpedia.org/resource/Entity-attribute-value_model" id="link-id1128dad0">Entity</a>-Attribute-Value model DBMS engines. 
In all cases failure has come down to the existence of one or more of the following deficiencies, across each potential alternative:</p> <ol> <li>Query language standardization - nothing close to <a href="http://dbpedia.org/resource/SQL" id="link-id16002d60">SQL</a> standardization</li> <li>Data Access API standardization - nothing close to ODBC, JDBC, OLE-DB, or ADO.NET</li> <li>Wire protocol standardization - nothing close to HTTP</li> <li>Distributed Identity infrastructure - nothing close to the non-repudiatable digital Identity that <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id14926b18">foaf</a>+ssl accords</li> <li>Use of Identifiers as network based pointers to data sources - nothing close to RDF based <a href="http://dbpedia.org/resource/Linked_Data" id="link-id16180a28">Linked Data</a> </li> <li>Negotiable data representation - nothing close to Mime and HTTP based Content Negotiation</li> <li>Scalability especially in the era of Internet &amp; Web scale.</li> </ol> <h4>Entity-Attribute-Value with Classes &amp; Relationships (<a href="http://dbpedia.org/resource/Entity-attribute-value_model" id="link-id13e741b8">EAV</a>/CR) data models</h4> <p>A common characteristic shared by all post-relational DBMS management systems (from Object Relational to pure Object) is an orientation towards variations of EAV/CR based data models. Unfortunately, all efforts in the EAV/CR realm have typically suffered from at least one of the deficiencies listed above. In addition, the same &quot;one DBMS model fits all&quot; approach that lies at the heart of the RDBMS downgrade also exists in the EAV/CR realm.</p> <h3>What Comes Next?</h3> <p>The RDBMS is not going away (ever), but its era of primacy -- by virtue of its placement at the apex of the data access and data management value pyramid -- is over! 
I make this bold claim for the following reasons: </p> <ol> <li> The Internet aided &quot;Global Village&quot; has brought &quot;<a href="http://en.wikipedia.org/wiki/Open_World_Assumption" id="link-id1148e560">Open World</a>&quot; vs &quot;<a href="http://en.wikipedia.org/wiki/Closed_World_Assumption" id="link-id11967cd0">Closed World</a>&quot; assumption issues to the fore e.g., the current global economic crisis remains centered on the inability to connect dots across &quot;Open World&quot; and &quot;Closed World&quot; data frontiers </li> <li> Entity-Attribute-Value with Classes &amp; Relationships (EAV/CR) based DBMS models are more effective when dealing with disparate data associated with disparate schemas, across disparate DBMS engines, host operating systems, and networks. </li> </ol> <p>Based on the above, it is crystal clear that a different kind of DBMS -- one with higher AVF relative to the RDBMS -- needs to sit atop today&#39;s data access and data management value pyramid. The characteristics of this DBMS must include the following:</p> <ol> <li> Every item of data (Datum/Entity/Object/Resource) has Identity</li> <li> Identity is achieved via Identifiers that aren&#39;t locked at the DBMS, OS, Network, or Application levels</li> <li> Object Identifiers and Object values are independent (extricably linked by association)</li> <li> Object values should be de-referencable via Object Identifier</li> <li> Representation of de-referenced value graph (entity, attributes, and values mesh) must be negotiable (i.e. content negotiation)</li> <li>Structured query language must provide mechanism for Creation, Deletion, Updates, and Querying of data objects</li> <li> Performance &amp; Scalability across &quot;Closed World&quot; (enterprise) and &quot;Open World&quot; (Internet &amp; Web) realms.</li> </ol> <p>Quick recap, I am not saying that RDBMS engine technology is dead or obsolete. 
I am simply stating that the era of RDBMS primacy within the data access and data management value pyramid is over. </p> <p>The problem domain (conceptual model views over heterogeneous data sources) at the apex of the aforementioned pyramid has simply evolved beyond the natural capabilities of the RDBMS which is rooted in &quot;Closed World&quot; assumptions re., data definition, access, and management. The need to maintain domain based conceptual interaction with data is now palpable at every echelon within our &quot;Global Village&quot; - Internet, Web, Enterprise, Government etc.</p> <p>It is my personal view that an EAV/CR model based DBMS, with support for the seven items enumerated above, can trigger the long anticipated RDBMS downgrade. Such a DBMS would be inherently multi-model because you would need the best of RDBMS and EAV/CR model engines in a single product, with in-built support for HTTP and other Internet protocols in order to effectively address data representation and serialization issues.</p> <h4>EAV/CR Oriented Data Access &amp; Management Technology</h4> <p>Examples of contemporary EAV/CR frameworks that provide concrete conceptual layers for data access and data management currently include:</p> <ul> <li> <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id115d1cb0"> Resource Description Framework</a> (RDF) - an EAV/CR based framework</li> <li> <a href="http://dbpedia.org/resource/Linked_Data" id="link-id116cf810">RDF Linked Data </a>- EAV/CR based framework that mandates de-referencable HTTP based Identifiers</li> <li> <a href="http://dbpedia.org/resource/ADO.NET_Entity_Framework" id="link-id13daa160">ADO.NET Entity Framework</a> - Microsoft .NET based EAV/CR framework</li> <li> <a href="http://dbpedia.org/page/Core_Data" id="link-id11111838">Core Data Services </a>- Mac OS X based EAV/CR framework that evolved from NeXT&#39;s <a href="http://dbpedia.org/resource/Enterprise_Objects_Framework" 
id="link-id15c27df0">Enterprise Objects Framework</a> (EOF).</li> </ul> <p>The frameworks above provide the basis for a revised AVF pyramid, as depicted below, that reflects today&#39;s data access and management realities i.e., an Internet &amp; Web driven global village comprised of interlinked distributed data objects, compatible with &quot;Open World&quot; assumptions.</p> <div> <img alt="Image" src="http://virtuoso.openlinksw.com/images/New_EAV_RDBMS_Pyramid.png" /> </div> <h3>Related</h3> <ul> 
<li>
  <a href="http://allanslibrary.blogspot.com/2009/06/semantic-way.html" id="link-id0xb8c5e498">The Semantic Way</a> - Alan Cho&#39;s Summary of <a href="http://www.pwc.com/extweb/home.nsf/docid/1308AF8EA7929CCA852575BA00720F26" id="link-id0xb80f5e10">PwC 2009 tech forecast report on the Semantic Web</a>
</li>
<li>
  <a href="http://www.readwriteweb.com/archives/is_the_relational_database_doomed.php" id="link-id0xb8c20658">Is the RDBMS Doomed</a> - <a href="http://www.readwriteweb.com">ReadWriteWeb</a> Article</li>
<li>
  <a href="http://www.metabrew.com/article/anti-rdbms-a-list-of-distributed-key-value-stores/" id="link-id0x1ab4778">Anti-RDBMS: a list of Distributed Key-Value Stores</a> - by <a href="http://www.last.fm/user/RJ" id="link-id0x5a968060">Richard Jones</a> (CTO Last.FM)</li>
<li> <a href="http://dynamicorange.com/2009/01/22/blueblog-how-and-why-glue-is-using-amazon-simpledb-instead-of-a-relational-database/" id="link-id15e07c10">How &amp; Why Glue is Using Amazon SimpleDB</a> </li> <li> <a href="http://www.cs.cmu.edu/afs/cs.cmu.edu/user/clamen/OODBMS/Manifesto/htManifesto/node4.html#SECTION00022000000000000000" id="link-id116cf450">Object Database Manifesto (Identity excerpt)</a> </li> <li> <a href="http://www.unixspace.com/context/databases.html" id="link-id150b2c20">Database Models Overview</a> </li> <li> <a href="http://www.youtube.com/watch?v=WEj9vqVvHPc&amp;feature=related" id="link-id0x66b0850">Ted Nelson Explaining Irregularity and Idiosyncrasy of Data Structures</a> - ZigZag Demo </li> </ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2009-01-09#1517">
  <rss:title>A Linked Data Web Approach To Semantic &quot;Search&quot; &amp; &quot;Find&quot; (Updated)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2009-01-09T23:34:50Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">The first salvo of what we&#39;ve been hinting about re. server side faceted browsing over Unlimited Data within configurable Interactive Time-frames is now available for experimentation at: http://b3s.openlinksw.com/fct/facet.vsp. Simple example / demo: Enter search pattern: Microsoft You will get the usual result from a full text pattern search i.e., hits and text excerpts with matching patterns in boldface. This first step is akin to throwing your net out to sea while fishing. Now you have your catch, what next? Basically, this is where traditional text search value ends since regex or xpath/xquery offer little when the structure of literal text is the key to filtering or categorization based analysis of real-world entities. Naturally, this is where the value of structured querying of linked data starts, as you seek to use entity descriptions (combination of attribute and relationship properties) to &quot;Find relevant things&quot;. Continuing with the demo. Click on the &quot;Properties&quot; link within the Navigation section of the browser page which results in a distillation and aggregation of the properties of the entities associated with the search results. Then use the &quot;Next&quot; link to page through the properties until you find the properties that best match what you seek. Note, this particular step is akin to using the properties of the catch (using the fishing analogy) for query filtering, with each subsequent property link click narrowing your selection further. Using property based filtering is just one perspective on the data corpus associated with the text search pattern; thus, you can alter perspectives by clicking on the &quot;Class&quot; link so that you can filter your search results by entity type. Of course, in a number of scenarios you would use a combination of entity types and entity properties filters to locate the entities of interest to you. 
A Few Notes about this demo instance of Virtuoso: Lookup Data Size (Local Linked Data Corpus): 2 Billion+ Triples (entity-attribute-value tuples) This is a *temporary* teaser / precursor to the LOD (Linking Open Data Cloud) variant of our Linked Data driven &quot;Search&quot; &amp; &quot;Find&quot; service; we decided to implement this functionality prior to commissioning a larger and more up to date instance based on the entire LOD Cloud The browser is simply using a Virtuoso PL function that also exists in Web Service form for loose binding by 3rd parties that have a UI orientation and focus (our UI is deliberately bare boned). The properties and entity types (classes) links expose formal definitions and dictionary provenance information materialized in an HTML page (of course your browser or any other HTTP user agent can negotiate alternative representations of this descriptive information) UMBEL based inference rules are enabled, giving you a live and simple demonstration of the virtues of Linked Data Dictionaries for example: click on the description link of any property or class from the foaf (friend-of-a-friend vocabulary), sioc (semantically-interlinked-online-communities ontology), mo (music ontology), bibo (bibliographic data ontology) namespaces to see how the data between these lower level vocabularies or ontologies are meshed with OpenCyc&#39;s upper level ontology. Related Faceted Search: Unlimited Data in Interactive Time Virtuoso Anytime: No Query Is Too Complex</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<p>The first salvo of what we&#39;ve been hinting about re. server side faceted browsing over Unlimited <a href="http://dbpedia.org/resource/Data">Data</a> within configurable Interactive Time-frames is now available for experimentation at:
<a href="http://b3s.openlinksw.com/fct/facet.vsp" id="link-ide41d210">http://b3s.openlinksw.com/fct/facet.vsp</a>.</p>

<h3>Simple example / demo:</h3>

<p>Enter search pattern: Microsoft</p>

<p>You will get the usual result from a full text pattern search i.e., hits and text excerpts with matching patterns in boldface. This first step is akin to throwing your net out to sea while fishing.</p>
<p>
Now you have your catch, what next? Basically, this is where traditional text search value ends since <a href="http://dbpedia.org/resource/regular_expression" id="link-id113b6840">regex</a> or <a href="http://dbpedia.org/resource/XPath" id="link-id1151c140">xpath</a>/<a href="http://dbpedia.org/resource/XQuery" id="link-id14565db8">xquery</a> offer little when the structure of literal text is the key to filtering or categorization based analysis of real-world entities. Naturally, this is where the value of structured querying of <a href="http://dbpedia.org/resource/Linked_Data" id="link-id11bc8208">linked data</a> starts, as you seek to use <a href="http://dbpedia.org/resource/Entity" id="link-id150e7298">entity</a> descriptions (combination of attribute and relationship properties) to &quot;Find relevant things&quot;.</p>

<p>Continuing with the demo.</p>

<p>Click on the &quot;Properties&quot; link within the Navigation section of the browser page which results in a distillation and aggregation of the properties of the entities associated with the search results. Then use the &quot;Next&quot; link to page through the properties until you find the properties that best match what you seek. Note, this particular step is akin to using the properties of the catch (using the fishing analogy) for query filtering, with each subsequent property link click narrowing your selection further.</p>

<p>Using property based filtering is just one perspective on the data corpus associated with the text search pattern; thus, you can alter perspectives by clicking on the &quot;Class&quot; link so that you can filter your search results by entity type. Of course, in a number of scenarios you would use a combination of entity types and entity properties filters to locate the entities of interest to you. </p>

<h3>A Few Notes about this demo instance of <a href="http://virtuoso.openlinksw.com" id="link-id14453088">Virtuoso</a>:</h3>
<ul>
<li>
Lookup Data Size (Local Linked Data Corpus): 2 Billion+ Triples (<a href="http://dbpedia.org/page/Entity-attribute-value_model" id="link-id13447558">entity-attribute-value</a> tuples)</li>
<li>
This is a *temporary* teaser / precursor to the <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id14e3bfc8">LOD</a> (Linking Open Data Cloud) variant of our Linked Data driven &quot;Search&quot; &amp; &quot;Find&quot; service; we decided to implement this functionality prior to commissioning a larger and more up to date instance based on the entire LOD Cloud</li>
<li>
The browser is simply using a <a href="http://virtuoso.openlinksw.com" id="link-id138b5688">Virtuoso</a> PL function that also exists in <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> Service form for loose binding by 3rd parties that have a UI orientation and focus (our UI is deliberately bare boned).</li>
<li>The properties and entity types (classes) links expose formal definitions and dictionary provenance <a href="http://dbpedia.org/resource/Information" id="link-id10ecc8e0">information</a> materialized in an HTML page (of course your browser or any other HTTP user agent can negotiate alternative representations of this descriptive information)</li> 
<li>
  <a href="http://umbel.org/about/" id="link-id117b95e0">UMBEL</a> based inference rules are enabled, giving you a live and simple demonstration of the virtues of Linked Data Dictionaries for example: click on the description link of any property or class from the <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id1595dd88">foaf</a> (friend-of-a-friend vocabulary), <a href="http://dbpedia.org/resource/SIOC" id="link-id151315e8">sioc</a> (semantically-interlinked-online-communities ontology), <a href="http://musicontology.com/" id="link-id15b9d6e8">mo</a> (music ontology), <a href="http://bibliontology.com/" id="link-id114257e8">bibo</a> (bibliographic data ontology) namespaces to see how the data between these lower level vocabularies or ontologies are meshed with <a href="http://dbpedia.org/resource/Cyc" id="link-id15b9be80">OpenCyc</a>&#39;s upper level ontology.
</li>
</ul>

<h3>Related</h3>
<ul>
<li>
<a href="http://www.openlinksw.com/dataspace/oerling/weblog/Orri%20Erling%27s%20Blog/1515" id="link-id14694eb8">Faceted Search: Unlimited Data in Interactive Time</a>
</li>
<li>
<a href="http://blogs.usnet.private:8893/Virtuoso Anytime: No Query Is Too Complex (updated)" id="link-id1356c630">Virtuoso Anytime: No Query Is Too Complex</a>
</li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-12-16#1499">
  <rss:title>&quot;E Pluribus Unum&quot;, or &quot;Inversely Functional Identity&quot;, or &quot;Smooshing Without the Stickiness&quot; (re-updated)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-12-16T14:14:43Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">What a terrible word, smooshing... I have understood it to mean that when you have two names for one thing, you give each all the attributes of the other. This smooshes them together, makes them interchangeable. This is complex, so I will begin with the point and the interested may read on for the details and implications. Starting with soon to be released version 6, Virtuoso allows you to say that two things, if they share a uniquely identifying property, are the same. Examples of uniquely identifying properties would be a book&#39;s ISBN number, or a person&#39;s social security plus full name. In relational language this is a unique key, and in RDF parlance, an inverse functional property. In most systems, such problems are dealt with as a preprocessing step before querying. For example, all the items that are considered the same will get the same properties or at load time all identifiers will be normalized according to some application rules. This is good if the rules are clear and understood. This is so in closed situations, where things tend to have standard identifiers to begin with. But on the open web this is not so clear cut. In this post, we show how to do these things ad hoc, without materializing anything. At the end, we also show how to materialize identity and what the consequences of this are with open web data. We use real live web crawls from the Billion Triples Challenge data set. On the linked data web, there are independently arising descriptions of the same thing and thus arises the need to smoosh, if these are to be somehow integrated. But this is only the beginning of the problems. To address these, we have added the option of specifying that some property will be considered inversely functional in a query. This is done at run time and the property does not really have to be inversely functional in the pure sense. foaf:name will do for an example. 
This simply means that for purposes of the query concerned, two subjects which have at least one foaf:name in common are considered the same. In this way, we can join between FOAF files. With the same database, a query about music preferences might consider having the same name as &quot;same enough,&quot; but a query about criminal prosecution would obviously need to be more precise about sameness. Our ontology is defined like this: -- Populate a named graph with the triples you want to use in query time inferencing ttlp ( &#39; @prefix foaf: &lt;http://xmlns.com/foaf/0.1/&gt; . @prefix owl: &lt;http://www.w3.org/2002/07/owl#&gt; . foaf:mbox_sha1sum a owl:InverseFunctionalProperty . foaf:name a owl:InverseFunctionalProperty . &#39;, &#39;xx&#39;, &#39;b3sifp&#39; ); -- Declare that the graph contains an ontology for use in query time inferencing rdfs_rule_set ( &#39;http://example.com/rules/b3sifp#&#39;, &#39;b3sifp&#39; ); Then use it: sparql DEFINE input:inference &quot;http://example.com/rules/b3sifp#&quot; SELECT DISTINCT ?k ?f1 ?f2 WHERE { ?k foaf:name ?n . ?n bif:contains &quot;&#39;Kjetil Kjernsmo&#39;&quot; . ?k foaf:knows ?f1 . 
?f1 foaf:knows ?f2 }; VARCHAR VARCHAR VARCHAR ______________________________________ _______________________________________________ ______________________________ http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/dajobe http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/net_twitter http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/amyvdh http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/pom http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/mattb http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/davorg http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/distobj http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/perigrin .... Without the inference, we get no matches. This is because the data in question has one graph per FOAF file, and blank nodes for persons. No graph references any person outside the ones in the graph. So if somebody is mentioned as known, then without the inference there is no way to get to what that person&#39;s FOAF file says, since the same individual will be a different blank node there. The declaration in the context named b3sifp just means that all things with a matching foaf:name or foaf:mbox_sha1sum are the same. Sameness means that two are the same for purposes of DISTINCT or GROUP BY, and if two are the same, then both have the UNION of all of the properties of both. If this were a naive smoosh, then the individuals would have all the same properties but would not be the same for DISTINCT. 
If we have complex application rules for determining whether individuals are the same, then one can materialize owl:sameAs triples and keep them in a separate graph. In this way, the original data is not contaminated and the materialized volume stays reasonable — nothing like the blow-up of duplicating properties across instances. The pro-smoosh argument is that if every duplicate makes exactly the same statements, then there is no great blow-up. Best and worst cases will always depend on the data. In rough terms, the more ad hoc the use, the less desirable the materialization. If the usage pattern is really set, then a relational-style application-specific representation with identity resolved at load time will perform best. We can do that too, but so can others. The principal point is about agility as concerns the inference. Run time is more agile than materialization, and if the rules change or if different users have different needs, then materialization runs into trouble. When talking web scale, having multiple users is a given; it is very uneconomical to give everybody their own copy, and the likelihood of a user accessing any significant part of the corpus is minimal. Even if the queries were not limited, the user would typically not wait for the answer of a query doing a scan or aggregation over 1 billion blog posts or something of the sort. So queries will typically be selective. Selective means that they do not access all of the data, hence do not benefit from ready-made materialization for things they do not even look at. The exception is corpus-wide statistics queries. But these will not be done in interactive time anyway, and will not be done very often. Plus, since these do not typically run all in memory, these are disk bound. And when things are disk bound, size matters. Reading extra entailment on the way is just a performance penalty. Enough talk. Time for an experiment. 
We take the Yahoo and Falcon web crawls from the Billion Triples Challenge set, and do two things with the FOAF data in them: Resolve identity at insert time. We remove duplicate person URIs, and give the single URI all the properties of all the duplicate URIs. We expect these to be most often repeats. If a person references another person, we normalize this reference to go to the single URI of the referenced person. Give every duplicate URI of a person all the properties of all the duplicates. If these are the same value, the data should not get much bigger, or so we think. For the experiment, we will consider two people the same if they have the same foaf:name and are both instances of foaf:Person. This gets some extra hits but should not be statistically significant. The following is a commented SQL script performing the smoosh. We play with internal IDs of things, thus some of these operations cannot be done in SPARQL alone. We use SPARQL where possible for readability. As the documentation states, iri_to_id converts from the qualified name of an IRI to its ID and id_to_iri does the reverse. We count the triples that enter into the smoosh: -- the name is an existence because else we&#39;d get several times more due to -- the names occurring in many graphs sparql SELECT COUNT(*) WHERE { { SELECT DISTINCT ?person WHERE { ?person a foaf:Person } } . FILTER ( bif:exists ( SELECT (1) WHERE { ?person foaf:name ?nn } ) ) . ?person ?p ?o }; -- We get 3284674 We make a few tables for intermediate results. 
-- For each distinct name, gather the properties and objects from -- all subjects with this name CREATE TABLE name_prop ( np_name ANY, np_p IRI_ID_8, np_o ANY, PRIMARY KEY ( np_name, np_p, np_o ) ); ALTER INDEX name_prop ON name_prop PARTITION ( np_name VARCHAR (-1, 0hexffff) ); -- Map from name to canonical IRI used for the name CREATE TABLE name_iri ( ni_name ANY PRIMARY KEY, ni_s IRI_ID_8 ); ALTER INDEX name_iri ON name_iri PARTITION ( ni_name VARCHAR (-1, 0hexffff) ); -- Map from person IRI to canonical person IRI CREATE TABLE pref_iri ( i IRI_ID_8, pref IRI_ID_8, PRIMARY KEY ( i ) ); ALTER INDEX pref_iri ON pref_iri PARTITION ( i INT (0hexffff00) ); -- a table for the materialization where all aliases get all properties of every other CREATE TABLE smoosh_ct ( s IRI_ID_8, p IRI_ID_8, o ANY, PRIMARY KEY ( s, p, o ) ); ALTER INDEX smoosh_ct ON smoosh_ct PARTITION ( s INT (0hexffff00) ); -- disable transaction log and enable row auto-commit. This is necessary, otherwise -- bulk operations are done transactionally and they will run out of rollback space. LOG_ENABLE (2); -- Gather all the properties of all persons with a name under that name. -- INSERT SOFT means that duplicates are ignored INSERT SOFT name_prop SELECT &quot;n&quot;, &quot;p&quot;, &quot;o&quot; FROM ( sparql DEFINE output:valmode &quot;LONG&quot; SELECT ?n ?p ?o WHERE { ?x a foaf:Person . ?x foaf:name ?n . 
?x ?p ?o } ) xx ; -- Now choose for each name the canonical IRI INSERT INTO name_iri SELECT np_name, ( SELECT MIN (s) FROM rdf_quad WHERE o = np_name AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ) AS mini FROM name_prop WHERE np_p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ; -- For each person IRI, map to the canonical IRI of that person INSERT SOFT pref_iri (i, pref) SELECT s, ni_s FROM name_iri, rdf_quad WHERE o = ni_name AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ; -- Make a graph where all persons have one iri with all the properties of all aliases -- and where person-to-person refs are canonicalized INSERT SOFT rdf_quad (g,s,p,o) SELECT IRI_TO_ID (&#39;psmoosh&#39;), ni_s, np_p, COALESCE ( ( SELECT pref FROM pref_iri WHERE i = np_o ), np_o ) FROM name_prop, name_iri WHERE ni_name = np_name OPTION ( loop, quietcast ) ; -- A little explanation: The properties of names are copied into rdf_quad with the name -- replaced with its canonical IRI. If the object has a canonical IRI, this is used as -- the object, else the object is unmodified. This is the COALESCE with the sub-query. -- This takes a little time. To check on the progress, take another connection to the -- server and do STATUS (&#39;cluster&#39;); -- It will return something like -- Cluster 4 nodes, 35 s. 108 m/s 1001 KB/s 75% cpu 186% read 12% clw threads 5r 0w 0i -- buffers 549481 253929 d 8 w 0 pfs -- Now finalize the state; this makes it permanent. Else the work will be lost on server -- failure, since there was no transaction log CL_EXEC (&#39;checkpoint&#39;); -- See what we got sparql SELECT COUNT (*) FROM &lt;psmoosh&gt; WHERE {?s ?p ?o}; -- This is 2253102 -- Now make the copy where all have the properties of all synonyms. This takes so much -- space we do not insert it as RDF quads, but make a special table for it so that we can -- run some statistics. This saves time. 
INSERT SOFT smoosh_ct (s, p, o) SELECT s, np_p, np_o FROM name_prop, rdf_quad WHERE o = np_name AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ; -- as above, INSERT SOFT so as to ignore duplicates SELECT COUNT (*) FROM smoosh_ct; -- This is 167360324 -- Find out where the bloat comes from SELECT TOP 20 COUNT (*), ID_TO_IRI (p) FROM smoosh_ct GROUP BY p ORDER BY 1 DESC; The results are: 54728777 http://www.w3.org/2002/07/owl#sameAs 48543153 http://xmlns.com/foaf/0.1/knows 13930234 http://www.w3.org/2000/01/rdf-schema#seeAlso 12268512 http://xmlns.com/foaf/0.1/interest 11415867 http://xmlns.com/foaf/0.1/nick 6683963 http://xmlns.com/foaf/0.1/weblog 6650093 http://xmlns.com/foaf/0.1/depiction 4231946 http://xmlns.com/foaf/0.1/mbox_sha1sum 4129629 http://xmlns.com/foaf/0.1/homepage 1776555 http://xmlns.com/foaf/0.1/holdsAccount 1219525 http://xmlns.com/foaf/0.1/based_near 305522 http://www.w3.org/1999/02/22-rdf-syntax-ns#type 274965 http://xmlns.com/foaf/0.1/name 155131 http://xmlns.com/foaf/0.1/dateOfBirth 153001 http://xmlns.com/foaf/0.1/img 111130 http://www.w3.org/2001/vcard-rdf/3.0#ADR 52930 http://xmlns.com/foaf/0.1/gender 48517 http://www.w3.org/2004/02/skos/core#subject 45697 http://www.w3.org/2000/01/rdf-schema#label 44860 http://purl.org/vocab/bio/0.1/olb Now compare with the predicate distribution of the smoosh with identities canonicalized sparql SELECT COUNT (*) ?p FROM &lt;psmoosh&gt; WHERE { ?s ?p ?o } GROUP BY ?p ORDER BY 1 DESC LIMIT 20; Results are: 748311 http://xmlns.com/foaf/0.1/knows 548391 http://xmlns.com/foaf/0.1/interest 140531 http://www.w3.org/2000/01/rdf-schema#seeAlso 105273 http://www.w3.org/1999/02/22-rdf-syntax-ns#type 78497 http://xmlns.com/foaf/0.1/name 48099 http://www.w3.org/2004/02/skos/core#subject 45179 http://xmlns.com/foaf/0.1/depiction 40229 http://www.w3.org/2000/01/rdf-schema#comment 38272 http://www.w3.org/2000/01/rdf-schema#label 37378 http://xmlns.com/foaf/0.1/nick 37186 http://dbpedia.org/property/abstract 
34003 http://xmlns.com/foaf/0.1/img 26182 http://xmlns.com/foaf/0.1/homepage 23795 http://www.w3.org/2002/07/owl#sameAs 17651 http://xmlns.com/foaf/0.1/mbox_sha1sum 17430 http://xmlns.com/foaf/0.1/dateOfBirth 15586 http://xmlns.com/foaf/0.1/page 12869 http://dbpedia.org/property/reference 12497 http://xmlns.com/foaf/0.1/weblog 12329 http://blogs.yandex.ru/schema/foaf/school We can drop the owl:sameAs triples from the count, which reduces the bloat somewhat, but it is still tens of times larger than the canonicalized copy or the initial state. Now, when we try using the psmoosh graph, we still get different results from the results with the original data. This is because foaf:knows relations to things with no foaf:name are not represented in the smoosh. These exist: sparql SELECT COUNT (*) WHERE { ?s foaf:knows ?thing . FILTER ( !bif:exists ( SELECT (1) WHERE { ?thing foaf:name ?nn } ) ) }; -- 1393940 So the smoosh graph is not an accurate rendition of the social network. It would have to be smooshed further to be that, since the data in the sample is quite irregular. But we do not go that far here. Finally, we calculate the smoosh blow up factors. We do not include owl:sameAs triples in the counts. select (167360324 - 54728777) / 3284674.0; 34.290022997716059 select 2229307 / 3284674.0; 0.678699621332284 So, to get a smoosh that is not really the equivalent of the original, multiply the original triple count by either 0.68 or 34, depending on whether synonyms are collapsed or not. Making the smooshes does not take very long, some minutes for the small one. Inserting the big one would be longer, a couple of hours maybe. It was 33 minutes for filling the smoosh_ct table. The measurements were not made with optimal tuning, so the performance numbers just serve to show that smooshing takes time. Probably more time than allowable in an interactive situation, no matter how the process is optimized.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>What a terrible word, smooshing...  I have understood it to mean that when you have two names for one thing, you give each all the attributes of the other.  This smooshes them together, makes them interchangeable.</p>

<p>This is complex, so I will begin with the point and the interested may read on for the details and implications.  Starting with the soon-to-be-released version 6, <a href="http://virtuoso.openlinksw.com" id="link-id15718cb8">Virtuoso</a> allows you to say that two things, if they share a uniquely identifying property, are the same.  Examples of uniquely identifying properties would be a book&#39;s ISBN number, or a person&#39;s social security number plus full name.  In relational language this is a <i>unique key</i>, and in <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id145ed998">RDF</a> parlance, an <i>inverse functional property</i>.</p>

<p>In most systems, such problems are dealt with as a preprocessing step before querying.  For example, all the items that are considered the same will get the same properties or at load time all identifiers will be normalized according to some application rules.  This is good if the rules are clear and understood.  This is so in closed situations, where things tend to have standard identifiers to begin with.  But on the open web this is not so clear cut.</p>

<p>In this post, we show how to do these things <i>ad hoc</i>, without materializing anything.  At the end, we also show how to materialize identity and what the consequences of this are with open web <a href="http://dbpedia.org/resource/Data" id="link-id11726358">data</a>.  We use real live web crawls from the <a href="http://challenge.semanticweb.org/" id="link-id14f40448">Billion Triples Challenge</a> data set.</p>

<p>On the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id156e2b10">linked data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id1106ce08">web</a>, there are independently arising descriptions of the same thing and thus arises the need to smoosh, if these are to be somehow integrated.  But this is only the beginning of the problems.</p>

<p>To address these, we have added the option of specifying that some property will be considered inversely functional in a query.  This is done at run time and the property does not really have to be inversely functional in the pure sense.  <code>foaf:name</code> will do for an example.  This simply means that for purposes of the query concerned, two subjects which have at least one <code>foaf:name</code> in common are considered the same. In this way, we can join between FOAF files.  With the same database, a query about music preferences might consider having the same name as &quot;same enough,&quot; but a query about criminal prosecution would obviously need to be more precise about sameness.</p>

<p>Our ontology is defined like this:</p>

<blockquote>
<pre>-- Populate a named graph with the triples you want to use in query time inferencing<br />
ttlp ( &#39;
        @prefix foaf: &lt;http://xmlns.com/foaf/0.1/&gt; .
        @prefix owl:  &lt;http://www.w3.org/2002/07/owl#&gt; .
        foaf:mbox_sha1sum  a  owl:InverseFunctionalProperty  .
        foaf:name          a  owl:InverseFunctionalProperty  .
       &#39;,
       &#39;xx&#39;,
       &#39;b3sifp&#39;
     );<br />
-- Declare that the graph contains an ontology for use in query time inferencing <br />
rdfs_rule_set ( &#39;http://example.com/rules/b3sifp#&#39;,
                &#39;b3sifp&#39;
              );
</pre></blockquote>

<p>Then use it:</p>

<blockquote>
<pre>sparql 
   DEFINE input:inference &quot;http://example.com/rules/b3sifp#&quot; 
   SELECT DISTINCT ?k ?f1 ?f2 
   WHERE { ?k   foaf:name     ?n                   . 
           ?n   bif:contains  &quot;&#39;Kjetil Kjernsmo&#39;&quot;  . 
           ?k   foaf:knows    ?f1                  . 
           ?f1  foaf:knows    ?f2 
         };<br />
VARCHAR                                  VARCHAR                                           VARCHAR
______________________________________   _______________________________________________   ______________________________<br />
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/dajobe
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/net_twitter
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/amyvdh
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/pom
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/mattb
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/davorg
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/distobj
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/perigrin
....
</pre></blockquote>

<p>Without the inference, we get no matches.  This is because the data in question has one graph per FOAF file, and blank nodes for persons.  No graph references any person outside the ones in the graph.  So if somebody is mentioned as known, then without the inference there is no way to get to what that person&#39;s FOAF file says, since the same individual will be a different blank node there.  The declaration in the context named <code>b3sifp</code> just means that all things with a matching <code>foaf:name</code> or <code>foaf:mbox_sha1sum</code> are the same.</p>

<p>Sameness means that two are the same for purposes of <code>DISTINCT</code> or <code>GROUP BY</code>, and if two are the same, then both have the <code>UNION</code> of all of the properties of both.</p>

<p>If this were a naive smoosh, then the individuals would have all the same properties but would not be the same for <code>DISTINCT</code>.</p>

<p>If we have complex application rules for determining whether individuals are the same, then one can materialize <code>owl:sameAs</code> triples and keep them in a separate graph.  In this way, the original data is not contaminated and the materialized volume stays reasonable &#8212; nothing like the blow-up of duplicating properties across instances.</p>

<p>The pro-smoosh argument is that if every duplicate makes exactly the same statements, then there is no great blow-up.  Best and worst cases will always depend on the data.  In rough terms, the more <i>ad hoc</i> the use, the less desirable the materialization.  If the usage pattern is really set, then a relational-style application-specific representation with identity resolved at load time will perform best.  We can do that too, but so can others.</p>

<p>The principal point is about agility as concerns the inference.  Run time is more agile than materialization, and if the rules change or if different users have different needs, then materialization runs into trouble.  When talking web scale, having multiple users is a given; it is very uneconomical to give everybody their own copy, and the likelihood of a user accessing any significant part of the corpus is minimal.  Even if the queries were not limited, the user would typically not wait for the answer of a query doing a scan or aggregation over 1 billion <a href="http://dbpedia.org/resource/Blog" id="link-id1156a550">blog</a> posts or something of the sort.  So queries will typically be selective.  Selective means that they do not access all of the data, hence do not benefit from ready-made materialization for things they do not even look at. </p>

<p>The exception is corpus-wide statistics queries.  But these will not be done in interactive time anyway, and will not be done very often. Plus, since these do not typically run all in memory, these are disk bound.  And when things are disk bound, size matters.  Reading extra entailment on the way is just a performance penalty.</p>

<p>Enough talk. Time for an experiment.  We take the Yahoo and Falcon web crawls from the Billion Triples Challenge set, and do two things with the FOAF data in them:</p>

<ol>
<li>Resolve identity at insert time.  We remove duplicate person URIs, and give the single <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id11317008">URI</a> all the properties of all the duplicate URIs.  We expect these to be most often repeats.  If a person references another person, we normalize this reference to go to the single URI of the referenced person.</li>

<li>Give every duplicate URI of a person all the properties of all the duplicates.  If these are the same value, the data should not get much bigger, or so we think.</li>
</ol>

<p>For the experiment, we will consider two people the same if they have the same <code>foaf:name</code> and are both instances of <code>foaf:Person</code>.  This gets some extra hits but should not be statistically significant.</p>

<p>The following is a commented <a href="http://dbpedia.org/resource/SQL" id="link-id110945b0">SQL</a> script performing the smoosh.  We play with internal IDs of things, thus some of these operations cannot be done in SPARQL alone.  We use SPARQL where possible for readability.  As the documentation states, <code>iri_to_id</code> converts from the qualified name of an IRI to its ID and <code>id_to_iri</code> does the reverse.</p>

<p>We count the triples that enter into the smoosh:</p>

<blockquote>
<pre>-- the name test is an existence check because otherwise we&#39;d get several times 
-- more rows, due to the names occurring in many graphs <br />
sparql 
   SELECT COUNT(*) 
    WHERE { { SELECT DISTINCT ?person 
               WHERE { ?person a foaf:Person }
            } . 
            FILTER ( bif:exists ( SELECT (1) 
                                   WHERE { ?person foaf:name ?nn } 
                                )
                       ) . 
            ?person ?p ?o
          };<br />
-- We get 3284674
</pre></blockquote>

<p>We make a few tables for intermediate results.</p>

<blockquote>
<pre>-- For each distinct name, gather the properties and objects from 
-- all subjects with this name <br />
CREATE TABLE name_prop 
   ( np_name  ANY, 
     np_p     IRI_ID_8, 
     np_o     ANY, 
     PRIMARY KEY ( np_name, 
                   np_p, 
                   np_o
                 )
   );
ALTER INDEX name_prop 
   ON name_prop 
   PARTITION ( np_name VARCHAR (-1, 0hexffff) );<br />
-- Map from name to canonical IRI used for the name <br />
CREATE TABLE name_iri ( ni_name  ANY PRIMARY KEY, 
                        ni_s     IRI_ID_8
                      );
ALTER INDEX name_iri 
   ON name_iri 
   PARTITION ( ni_name VARCHAR (-1, 0hexffff) );<br />
-- Map from person IRI to canonical person IRI<br />
CREATE TABLE pref_iri 
   ( i     IRI_ID_8, 
     pref  IRI_ID_8, 
     PRIMARY KEY ( i )
   );
ALTER INDEX pref_iri 
   ON pref_iri 
   PARTITION ( i INT (0hexffff00) );<br />
-- a table for the materialization where all aliases get all properties of every other <br />
CREATE TABLE smoosh_ct 
   ( s  IRI_ID_8, 
     p  IRI_ID_8, 
     o  ANY, 
     PRIMARY KEY ( s, 
                   p, 
                   o
                 ) 
   );
ALTER INDEX smoosh_ct 
   ON smoosh_ct 
   PARTITION ( s INT (0hexffff00) );<br />
-- disable transaction log and enable row auto-commit.  This is necessary, otherwise 
-- bulk operations are done transactionally and they will run out of rollback space.<br />
LOG_ENABLE (2);<br />
-- Gather all the properties of all persons with a name under that name.  
-- INSERT SOFT means that duplicates are ignored <br />
INSERT SOFT name_prop 
   SELECT &quot;n&quot;, &quot;p&quot;, &quot;o&quot; 
   FROM ( sparql 
          DEFINE output:valmode &quot;LONG&quot; 
          SELECT ?n ?p ?o 
          WHERE { ?x a foaf:Person . 
                 ?x foaf:name ?n . 
                 ?x ?p ?o
               }
        ) xx ;<br />
-- Now choose for each name the canonical IRI <br />
INSERT INTO name_iri 
   SELECT np_name, 
          ( SELECT MIN (s) 
              FROM rdf_quad 
             WHERE o = np_name 
                   AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;)
          ) AS mini 
     FROM name_prop 
    WHERE np_p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ;<br />
-- For each person IRI, map to the canonical IRI of that person <br />
INSERT SOFT pref_iri (i, pref) 
   SELECT s, 
          ni_s 
     FROM name_iri, 
          rdf_quad 
    WHERE o = ni_name 
          AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ;<br />
-- Make a graph where all persons have one iri with all the properties of all aliases 
-- and where person-to-person refs are canonicalized<br />
INSERT SOFT rdf_quad (g,s,p,o) 
   SELECT IRI_TO_ID (&#39;psmoosh&#39;), 
          ni_s, 
          np_p, 
 COALESCE ( ( SELECT pref 
              FROM pref_iri 
              WHERE i = np_o
            ), 
            np_o 
          )
     FROM name_prop, 
          name_iri 
    WHERE ni_name = np_name 
   OPTION ( loop, quietcast ) ;<br />
-- A little explanation:  The properties of names are copied into rdf_quad with the name 
-- replaced with its canonical IRI.  If the object has a canonical IRI, this is used as 
-- the object, else the object is unmodified.  This is the COALESCE with the sub-query.<br />
-- This takes a little time.  To check on the progress, take another connection to the 
-- server and do <br />
STATUS (&#39;cluster&#39;);<br />
-- It will return something like 
-- Cluster 4 nodes, 35 s. 108 m/s 1001 KB/s  75% cpu 186%  read 12% clw threads 5r 0w 0i 
-- buffers 549481 253929 d 8 w 0 pfs<br />
-- Now finalize the state; this makes it permanent.  Else the work will be lost on server 
-- failure, since there was no transaction log <br />
CL_EXEC (&#39;checkpoint&#39;);<br />
-- See what we got<br />
sparql 
   SELECT COUNT (*) 
     FROM &lt;psmoosh&gt; 
     WHERE {?s ?p ?o};<br />
-- This is 2253102<br />
-- Now make the copy where all have the properties of all synonyms.  This takes so much 
-- space we do not insert it as RDF quads, but make a special table for it so that we can 
-- run some statistics.  This saves time.<br />
INSERT SOFT smoosh_ct (s, p, o)  
   SELECT s, np_p, np_o 
     FROM name_prop, 
          rdf_quad 
    WHERE o = np_name 
          AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ;<br />
-- as above, INSERT SOFT so as to ignore duplicates <br />
SELECT COUNT (*) 
   FROM smoosh_ct;<br />
-- This is  167360324<br />
-- Find out where the bloat comes from <br />
SELECT TOP 20 COUNT (*), 
              ID_TO_IRI (p) 
   FROM smoosh_ct 
   GROUP BY p 
   ORDER BY 1 DESC;
</pre></blockquote>
<p>The results are:</p>

<blockquote>
<pre>54728777          http://www.w3.org/2002/07/owl#sameAs
48543153          http://xmlns.com/foaf/0.1/knows
13930234          http://www.w3.org/2000/01/rdf-schema#seeAlso
12268512          http://xmlns.com/foaf/0.1/interest
11415867          http://xmlns.com/foaf/0.1/nick
6683963           http://xmlns.com/foaf/0.1/weblog
6650093           http://xmlns.com/foaf/0.1/depiction
4231946           http://xmlns.com/foaf/0.1/mbox_sha1sum
4129629           http://xmlns.com/foaf/0.1/homepage
1776555           http://xmlns.com/foaf/0.1/holdsAccount
1219525           http://xmlns.com/foaf/0.1/based_near
305522            http://www.w3.org/1999/02/22-rdf-syntax-ns#type
274965            http://xmlns.com/foaf/0.1/name
155131            http://xmlns.com/foaf/0.1/dateOfBirth
153001            http://xmlns.com/foaf/0.1/img
111130            http://www.w3.org/2001/vcard-rdf/3.0#ADR
52930             http://xmlns.com/foaf/0.1/gender
48517             http://www.w3.org/2004/02/skos/core#subject
45697             http://www.w3.org/2000/01/rdf-schema#label
44860             http://purl.org/vocab/bio/0.1/olb
</pre></blockquote>

<p>Now compare with the predicate distribution of the smoosh with identities canonicalized:</p>

<blockquote>
<pre>sparql 
     SELECT COUNT (*) ?p 
       FROM &lt;psmoosh&gt; 
      WHERE { ?s ?p ?o } 
   GROUP BY ?p 
   ORDER BY 1 DESC 
      LIMIT 20;</pre></blockquote>

<p>Results are:</p>
<blockquote>
<pre>748311            http://xmlns.com/foaf/0.1/knows
548391            http://xmlns.com/foaf/0.1/interest
140531            http://www.w3.org/2000/01/rdf-schema#seeAlso
105273            http://www.w3.org/1999/02/22-rdf-syntax-ns#type
78497             http://xmlns.com/foaf/0.1/name
48099             http://www.w3.org/2004/02/skos/core#subject
45179             http://xmlns.com/foaf/0.1/depiction
40229             http://www.w3.org/2000/01/rdf-schema#comment
38272             http://www.w3.org/2000/01/rdf-schema#label
37378             http://xmlns.com/foaf/0.1/nick
37186             http://dbpedia.org/property/abstract
34003             http://xmlns.com/foaf/0.1/img
26182             http://xmlns.com/foaf/0.1/homepage
23795             http://www.w3.org/2002/07/owl#sameAs
17651             http://xmlns.com/foaf/0.1/mbox_sha1sum
17430             http://xmlns.com/foaf/0.1/dateOfBirth
15586             http://xmlns.com/foaf/0.1/page
12869             http://dbpedia.org/property/reference
12497             http://xmlns.com/foaf/0.1/weblog
12329             http://blogs.yandex.ru/schema/foaf/school
</pre></blockquote>

<p>We can drop the <code>owl:sameAs</code> triples from the count, which reduces the bloat somewhat, but it is still tens of times larger than the canonicalized copy or the initial state.</p>

<p>Now, when we try using the psmoosh graph, we still get different results from the results with the original data.  This is because <code>foaf:knows</code> relations to things with no <code>foaf:name</code> are not represented in the smoosh.  These exist:</p>

<blockquote>
<pre>sparql 
SELECT COUNT (*) 
   WHERE { ?s foaf:knows ?thing . 
           FILTER ( !bif:exists ( SELECT (1) 
                                   WHERE { ?thing foaf:name ?nn }
                                )
                  ) 
         };<br />
-- 1393940
</pre></blockquote>

<p>So the smoosh graph is not an accurate rendition of the social network.  It would have to be smooshed further to be that, since the data in the sample is quite irregular.  But we do not go that far here.</p>

<p>Finally, we calculate the smoosh blow up factors.  We do not include <code>owl:sameAs</code> triples in the counts.</p>

<blockquote>
<pre>select (167360324 - 54728777) / 3284674.0;
34.290022997716059<br />
-- 2229307 = 2253102 triples in psmoosh - 23795 owl:sameAs triples in psmoosh
select 2229307 / 3284674.0;
0.678699621332284
</pre></blockquote>

<p>So, to get a smoosh that is not really the equivalent of the original, multiply the original triple count by either 0.68 or 34, depending on whether synonyms are collapsed or not.</p>

<p>Making the smooshes does not take very long, some minutes for the small one.  Inserting the big one would be longer, a couple of hours maybe.  It was 33 minutes for filling the <code>smoosh_ct</code> table.  The measurements were not made with optimal tuning, so the performance numbers just serve to show that smooshing takes time.  Probably more time than allowable in an interactive situation, no matter how the process is optimized.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-12-16#1498">
  <rss:title>&quot;E Pluribus Unum&quot;, or &quot;Inversely Functional Identity&quot;, or &quot;Smooshing Without the Stickiness&quot; (re-updated)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-12-16T14:14:43Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">What a terrible word, smooshing... I have understood it to mean that when you have two names for one thing, you give each all the attributes of the other. This smooshes them together, makes them interchangeable. This is complex, so I will begin with the point and the interested may read on for the details and implications. Starting with soon to be released version 6, Virtuoso allows you to say that two things, if they share a uniquely identifying property, are the same. Examples of uniquely identifying properties would be a book&#39;s ISBN number, or a person&#39;s social security plus full name. In relational language this is a unique key, and in RDF parlance, an inverse functional property. In most systems, such problems are dealt with as a preprocessing step before querying. For example, all the items that are considered the same will get the same properties or at load time all identifiers will be normalized according to some application rules. This is good if the rules are clear and understood. This is so in closed situations, where things tend to have standard identifiers to begin with. But on the open web this is not so clear cut. In this post, we show how to do these things ad hoc, without materializing anything. At the end, we also show how to materialize identity and what the consequences of this are with open web data. We use real live web crawls from the Billion Triples Challenge data set. On the linked data web, there are independently arising descriptions of the same thing and thus arises the need to smoosh, if these are to be somehow integrated. But this is only the beginning of the problems. To address these, we have added the option of specifying that some property will be considered inversely functional in a query. This is done at run time and the property does not really have to be inversely functional in the pure sense. foaf:name will do for an example. 
This simply means that for purposes of the query concerned, two subjects which have at least one foaf:name in common are considered the same. In this way, we can join between FOAF files. With the same database, a query about music preferences might consider having the same name as &quot;same enough,&quot; but a query about criminal prosecution would obviously need to be more precise about sameness. Our ontology is defined like this: -- Populate a named graph with the triples you want to use in query time inferencing ttlp ( &#39; @prefix foaf: &lt;http://xmlns.com/foaf/0.1/&gt; . @prefix owl: &lt;http://www.w3.org/2002/07/owl#&gt; . foaf:mbox_sha1sum a owl:InverseFunctionalProperty . foaf:name a owl:InverseFunctionalProperty . &#39;, &#39;xx&#39;, &#39;b3sifp&#39; ); -- Declare that the graph contains an ontology for use in query time inferencing rdfs_rule_set ( &#39;http://example.com/rules/b3sifp#&#39;, &#39;b3sifp&#39; ); Then use it: sparql DEFINE input:inference &quot;http://example.com/rules/b3sifp#&quot; SELECT DISTINCT ?k ?f1 ?f2 WHERE { ?k foaf:name ?n . ?n bif:contains &quot;&#39;Kjetil Kjernsmo&#39;&quot; . ?k foaf:knows ?f1 . 
?f1 foaf:knows ?f2 }; VARCHAR VARCHAR VARCHAR ______________________________________ _______________________________________________ ______________________________ http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/dajobe http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/net_twitter http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/amyvdh http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/pom http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/mattb http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/davorg http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/distobj http://www.kjetil.kjernsmo.net/foaf#me http://norman.walsh.name/knows/who/robin-berjon http://twitter.com/perigrin .... Without the inference, we get no matches. This is because the data in question has one graph per FOAF file, and blank nodes for persons. No graph references any person outside the ones in the graph. So if somebody is mentioned as known, then without the inference there is no way to get to what that person&#39;s FOAF file says, since the same individual will be a different blank node there. The declaration in the context named b3sifp just means that all things with a matching foaf:name or foaf:mbox_sha1sum are the same. Sameness means that two are the same for purposes of DISTINCT or GROUP BY, and if two are the same, then both have the UNION of all of the properties of both. If this were a naive smoosh, then the individuals would have all the same properties but would not be the same for DISTINCT. 
If we have complex application rules for determining whether individuals are the same, then one can materialize owl:sameAs triples and keep them in a separate graph. In this way, the original data is not contaminated and the materialized volume stays reasonable &#8212; nothing like the blow-up of duplicating properties across instances. The pro-smoosh argument is that if every duplicate makes exactly the same statements, then there is no great blow-up. Best and worst cases will always depend on the data. In rough terms, the more ad hoc the use, the less desirable the materialization. If the usage pattern is really set, then a relational-style application-specific representation with identity resolved at load time will perform best. We can do that too, but so can others. The principal point is about agility as concerns the inference. Run time is more agile than materialization, and if the rules change or if different users have different needs, then materialization runs into trouble. When talking web scale, having multiple users is a given; it is very uneconomical to give everybody their own copy, and the likelihood of a user accessing any significant part of the corpus is minimal. Even if the queries were not limited, the user would typically not wait for the answer of a query doing a scan or aggregation over 1 billion blog posts or something of the sort. So queries will typically be selective. Selective means that they do not access all of the data, hence do not benefit from ready-made materialization for things they do not even look at. The exception is corpus-wide statistics queries. But these will not be done in interactive time anyway, and will not be done very often. Plus, since these do not typically run all in memory, these are disk bound. And when things are disk bound, size matters. Reading extra entailment on the way is just a performance penalty. Enough talk. Time for an experiment. 
We take the Yahoo and Falcon web crawls from the Billion Triples Challenge set, and do two things with the FOAF data in them: Resolve identity at insert time. We remove duplicate person URIs, and give the single URI all the properties of all the duplicate URIs. We expect these to be most often repeats. If a person references another person, we normalize this reference to go to the single URI of the referenced person. Give every duplicate URI of a person all the properties of all the duplicates. If these are the same value, the data should not get much bigger, or so we think. For the experiment, we will consider two people the same if they have the same foaf:name and are both instances of foaf:Person. This gets some extra hits but should not be statistically significant. The following is a commented SQL script performing the smoosh. We play with internal IDs of things, thus some of these operations cannot be done in SPARQL alone. We use SPARQL where possible for readability. As the documentation states, iri_to_id converts from the qualified name of an IRI to its ID and id_to_iri does the reverse. We count the triples that enter into the smoosh: -- the name is an existence because else we&#39;d get several times more due to -- the names occurring in many graphs sparql SELECT COUNT(*) WHERE { { SELECT DISTINCT ?person WHERE { ?person a foaf:Person } } . FILTER ( bif:exists ( SELECT (1) WHERE { ?person foaf:name ?nn } ) ) . ?person ?p ?o }; -- We get 3284674 We make a few tables for intermediate results. 
-- For each distinct name, gather the properties and objects from -- all subjects with this name CREATE TABLE name_prop ( np_name ANY, np_p IRI_ID_8, np_o ANY, PRIMARY KEY ( np_name, np_p, np_o ) ); ALTER INDEX name_prop ON name_prop PARTITION ( np_name VARCHAR (-1, 0hexffff) ); -- Map from name to canonical IRI used for the name CREATE TABLE name_iri ( ni_name ANY PRIMARY KEY, ni_s IRI_ID_8 ); ALTER INDEX name_iri ON name_iri PARTITION ( ni_name VARCHAR (-1, 0hexffff) ); -- Map from person IRI to canonical person IRI CREATE TABLE pref_iri ( i IRI_ID_8, pref IRI_ID_8, PRIMARY KEY ( i ) ); ALTER INDEX pref_iri ON pref_iri PARTITION ( i INT (0hexffff00) ); -- a table for the materialization where all aliases get all properties of every other CREATE TABLE smoosh_ct ( s IRI_ID_8, p IRI_ID_8, o ANY, PRIMARY KEY ( s, p, o ) ); ALTER INDEX smoosh_ct ON smoosh_ct PARTITION ( s INT (0hexffff00) ); -- disable transaction log and enable row auto-commit. This is necessary, otherwise -- bulk operations are done transactionally and they will run out of rollback space. LOG_ENABLE (2); -- Gather all the properties of all persons with a name under that name. -- INSERT SOFT means that duplicates are ignored INSERT SOFT name_prop SELECT &quot;n&quot;, &quot;p&quot;, &quot;o&quot; FROM ( sparql DEFINE output:valmode &quot;LONG&quot; SELECT ?n ?p ?o WHERE { ?x a foaf:Person . ?x foaf:name ?n . 
?x ?p ?o } ) xx ; -- Now choose for each name the canonical IRI INSERT INTO name_iri SELECT np_name, ( SELECT MIN (s) FROM rdf_quad WHERE o = np_name AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ) AS mini FROM name_prop WHERE np_p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ; -- For each person IRI, map to the canonical IRI of that person INSERT SOFT pref_iri (i, pref) SELECT s, ni_s FROM name_iri, rdf_quad WHERE o = ni_name AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ; -- Make a graph where all persons have one iri with all the properties of all aliases -- and where person-to-person refs are canonicalized INSERT SOFT rdf_quad (g,s,p,o) SELECT IRI_TO_ID (&#39;psmoosh&#39;), ni_s, np_p, COALESCE ( ( SELECT pref FROM pref_iri WHERE i = np_o ), np_o ) FROM name_prop, name_iri WHERE ni_name = np_name OPTION ( loop, quietcast ) ; -- A little explanation: The properties of names are copied into rdf_quad with the name -- replaced with its canonical IRI. If the object has a canonical IRI, this is used as -- the object, else the object is unmodified. This is the COALESCE with the sub-query. -- This takes a little time. To check on the progress, take another connection to the -- server and do STATUS (&#39;cluster&#39;); -- It will return something like -- Cluster 4 nodes, 35 s. 108 m/s 1001 KB/s 75% cpu 186% read 12% clw threads 5r 0w 0i -- buffers 549481 253929 d 8 w 0 pfs -- Now finalize the state; this makes it permanent. Else the work will be lost on server -- failure, since there was no transaction log CL_EXEC (&#39;checkpoint&#39;); -- See what we got sparql SELECT COUNT (*) FROM &lt;psmoosh&gt; WHERE {?s ?p ?o}; -- This is 2253102 -- Now make the copy where all have the properties of all synonyms. This takes so much -- space we do not insert it as RDF quads, but make a special table for it so that we can -- run some statistics. This saves time. 
INSERT SOFT smoosh_ct (s, p, o) SELECT s, np_p, np_o FROM name_prop, rdf_quad WHERE o = np_name AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ; -- as above, INSERT SOFT so as to ignore duplicates SELECT COUNT (*) FROM smoosh_ct; -- This is 167360324 -- Find out where the bloat comes from SELECT TOP 20 COUNT (*), ID_TO_IRI (p) FROM smoosh_ct GROUP BY p ORDER BY 1 DESC; The results are: 54728777 http://www.w3.org/2002/07/owl#sameAs 48543153 http://xmlns.com/foaf/0.1/knows 13930234 http://www.w3.org/2000/01/rdf-schema#seeAlso 12268512 http://xmlns.com/foaf/0.1/interest 11415867 http://xmlns.com/foaf/0.1/nick 6683963 http://xmlns.com/foaf/0.1/weblog 6650093 http://xmlns.com/foaf/0.1/depiction 4231946 http://xmlns.com/foaf/0.1/mbox_sha1sum 4129629 http://xmlns.com/foaf/0.1/homepage 1776555 http://xmlns.com/foaf/0.1/holdsAccount 1219525 http://xmlns.com/foaf/0.1/based_near 305522 http://www.w3.org/1999/02/22-rdf-syntax-ns#type 274965 http://xmlns.com/foaf/0.1/name 155131 http://xmlns.com/foaf/0.1/dateOfBirth 153001 http://xmlns.com/foaf/0.1/img 111130 http://www.w3.org/2001/vcard-rdf/3.0#ADR 52930 http://xmlns.com/foaf/0.1/gender 48517 http://www.w3.org/2004/02/skos/core#subject 45697 http://www.w3.org/2000/01/rdf-schema#label 44860 http://purl.org/vocab/bio/0.1/olb Now compare with the predicate distribution of the smoosh with identities canonicalized sparql SELECT COUNT (*) ?p FROM &lt;psmoosh&gt; WHERE { ?s ?p ?o } GROUP BY ?p ORDER BY 1 DESC LIMIT 20; Results are: 748311 http://xmlns.com/foaf/0.1/knows 548391 http://xmlns.com/foaf/0.1/interest 140531 http://www.w3.org/2000/01/rdf-schema#seeAlso 105273 http://www.w3.org/1999/02/22-rdf-syntax-ns#type 78497 http://xmlns.com/foaf/0.1/name 48099 http://www.w3.org/2004/02/skos/core#subject 45179 http://xmlns.com/foaf/0.1/depiction 40229 http://www.w3.org/2000/01/rdf-schema#comment 38272 http://www.w3.org/2000/01/rdf-schema#label 37378 http://xmlns.com/foaf/0.1/nick 37186 http://dbpedia.org/property/abstract 
34003 http://xmlns.com/foaf/0.1/img 26182 http://xmlns.com/foaf/0.1/homepage 23795 http://www.w3.org/2002/07/owl#sameAs 17651 http://xmlns.com/foaf/0.1/mbox_sha1sum 17430 http://xmlns.com/foaf/0.1/dateOfBirth 15586 http://xmlns.com/foaf/0.1/page 12869 http://dbpedia.org/property/reference 12497 http://xmlns.com/foaf/0.1/weblog 12329 http://blogs.yandex.ru/schema/foaf/school We can drop the owl:sameAs triples from the count, so the bloat is a bit less by that but it still is tens of times larger than the canonicalized copy or the initial state. Now, when we try using the psmoosh graph, we still get different results from the results with the original data. This is because foaf:knows relations to things with no foaf:name are not represented in the smoosh. They exist: sparql SELECT COUNT (*) WHERE { ?s foaf:knows ?thing . FILTER ( !bif:exists ( SELECT (1) WHERE { ?thing foaf:name ?nn } ) ) }; -- 1393940 So the smoosh graph is not an accurate rendition of the social network. It would have to be smooshed further to be that, since the data in the sample is quite irregular. But we do not go that far here. Finally, we calculate the smoosh blow up factors. We do not include owl:sameAs triples in the counts. select (167360324 - 54728777) / 3284674.0; 34.290022997716059 select 2229307 / 3284674.0; = 0.678699621332284 So, to get a smoosh that is not really the equivalent of the original, either multiply the original triple count by 34 or 0.68, depending on whether synonyms are collapsed or not. Making the smooshes does not take very long, some minutes for the small one. Inserting the big one would be longer, a couple of hours maybe. It was 33 minutes for filling the smoosh_ct table. The metrics were not with optimal tuning so the performance numbers just serve to show that smooshing takes time. Probably more time than allowable in an interactive situation, no matter how the process is optimized.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>What a terrible word, smooshing...  I have understood it to mean that when you have two names for one thing, you give each all the attributes of the other.  This smooshes them together, makes them interchangeable.</p>

<p>This is complex, so I will begin with the point and the interested may read on for the details and implications.  Starting with the soon-to-be-released version 6, <a href="http://virtuoso.openlinksw.com" id="link-id15718cb8">Virtuoso</a> allows you to say that two things, if they share a uniquely identifying property, are the same.  Examples of uniquely identifying properties would be a book&#39;s ISBN number, or a person&#39;s social security number plus full name.  In relational language this is a <i>unique key</i>, and in <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id145ed998">RDF</a> parlance, an <i>inverse functional property</i>.</p>

<p>In most systems, such problems are dealt with as a preprocessing step before querying.  For example, all the items that are considered the same will get the same properties or at load time all identifiers will be normalized according to some application rules.  This is good if the rules are clear and understood.  This is so in closed situations, where things tend to have standard identifiers to begin with.  But on the open web this is not so clear cut.</p>

<p>In this post, we show how to do these things <i>ad hoc</i>, without materializing anything.  At the end, we also show how to materialize identity and what the consequences of this are with open web <a href="http://dbpedia.org/resource/Data" id="link-id11726358">data</a>.  We use real live web crawls from the <a href="http://challenge.semanticweb.org/" id="link-id14f40448">Billion Triples Challenge</a> data set.</p>

<p>On the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id156e2b10">linked data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id1106ce08">web</a>, there are independently arising descriptions of the same thing and thus arises the need to smoosh, if these are to be somehow integrated.  But this is only the beginning of the problems.</p>

<p>To address these, we have added the option of specifying that some property will be considered inversely functional in a query.  This is done at run time and the property does not really have to be inversely functional in the pure sense.  <code>foaf:name</code> will do for an example.  This simply means that for purposes of the query concerned, two subjects which have at least one <code>foaf:name</code> in common are considered the same. In this way, we can join between FOAF files.  With the same database, a query about music preferences might consider having the same name as &quot;same enough,&quot; but a query about criminal prosecution would obviously need to be more precise about sameness.</p>

<p>Our ontology is defined like this:</p>

<blockquote>
<pre>-- Populate a named graph with the triples you want to use in query time inferencing<br />
ttlp ( &#39;
        @prefix foaf: &lt;http://xmlns.com/foaf/0.1/&gt; .
        @prefix owl:  &lt;http://www.w3.org/2002/07/owl#&gt; .
        foaf:mbox_sha1sum  a  owl:InverseFunctionalProperty  .
        foaf:name          a  owl:InverseFunctionalProperty  .
       &#39;,
       &#39;xx&#39;,
       &#39;b3sifp&#39;
     );<br />
-- Declare that the graph contains an ontology for use in query time inferencing <br />
rdfs_rule_set ( &#39;http://example.com/rules/b3sifp#&#39;,
                &#39;b3sifp&#39;
              );
</pre></blockquote>

<p>Then use it:</p>

<blockquote>
<pre>sparql 
   DEFINE input:inference &quot;http://example.com/rules/b3sifp#&quot; 
   SELECT DISTINCT ?k ?f1 ?f2 
   WHERE { ?k   foaf:name     ?n                   . 
           ?n   bif:contains  &quot;&#39;Kjetil Kjernsmo&#39;&quot;  . 
           ?k   foaf:knows    ?f1                  . 
           ?f1  foaf:knows    ?f2 
         };<br />
VARCHAR                                  VARCHAR                                           VARCHAR
______________________________________   _______________________________________________   ______________________________<br />
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/dajobe
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/net_twitter
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/amyvdh
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/pom
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/mattb
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/davorg
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/distobj
http://www.kjetil.kjernsmo.net/foaf#me   http://norman.walsh.name/knows/who/robin-berjon   http://twitter.com/perigrin
....
</pre></blockquote>

<p>Without the inference, we get no matches.  This is because the data in question has one graph per FOAF file, and blank nodes for persons.  No graph references any person outside the ones in the graph.  So if somebody is mentioned as known, then without the inference there is no way to get to what that person&#39;s FOAF file says, since the same individual will be a different blank node there.  The declaration in the context named <code>b3sifp</code> just means that all things with a matching <code>foaf:name</code> or <code>foaf:mbox_sha1sum</code> are the same.</p>

<p>Sameness means that two are the same for purposes of <code>DISTINCT</code> or <code>GROUP BY</code>, and if two are the same, then both have the <code>UNION</code> of all of the properties of both.</p>

<p>If this were a naive smoosh, then the individuals would have all the same properties but would not be the same for <code>DISTINCT</code>.</p>

<p>If we have complex application rules for determining whether individuals are the same, then one can materialize <code>owl:sameAs</code> triples and keep them in a separate graph.  In this way, the original data is not contaminated and the materialized volume stays reasonable &#8212; nothing like the blow-up of duplicating properties across instances.</p>

<p>The pro-smoosh argument is that if every duplicate makes exactly the same statements, then there is no great blow-up.  Best and worst cases will always depend on the data.  In rough terms, the more <i>ad hoc</i> the use, the less desirable the materialization.  If the usage pattern is really set, then a relational-style application-specific representation with identity resolved at load time will perform best.  We can do that too, but so can others.</p>

<p>The principal point is about agility as concerns the inference.  Run time is more agile than materialization, and if the rules change or if different users have different needs, then materialization runs into trouble.  When talking web scale, having multiple users is a given; it is very uneconomical to give everybody their own copy, and the likelihood of a user accessing any significant part of the corpus is minimal.  Even if the queries were not limited, the user would typically not wait for the answer of a query doing a scan or aggregation over 1 billion <a href="http://dbpedia.org/resource/Blog" id="link-id1156a550">blog</a> posts or something of the sort.  So queries will typically be selective.  Selective means that they do not access all of the data, hence do not benefit from ready-made materialization for things they do not even look at. </p>

<p>The exception is corpus-wide statistics queries.  But these will not be done in interactive time anyway, and will not be done very often. Plus, since these do not typically run all in memory, these are disk bound.  And when things are disk bound, size matters.  Reading extra entailment on the way is just a performance penalty.</p>

<p>Enough talk. Time for an experiment.  We take the Yahoo and Falcon web crawls from the Billion Triples Challenge set, and do two things with the FOAF data in them:</p>

<ol>
<li>Resolve identity at insert time.  We remove duplicate person URIs, and give the single <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id11317008">URI</a> all the properties of all the duplicate URIs.  We expect these to be most often repeats.  If a person references another person, we normalize this reference to go to the single URI of the referenced person.</li>

<li>Give every duplicate URI of a person all the properties of all the duplicates.  If these are the same value, the data should not get much bigger, or so we think.</li>
</ol>

<p>For the experiment, we will consider two people the same if they have the same <code>foaf:name</code> and are both instances of <code>foaf:Person</code>.  This gets some extra hits but should not be statistically significant.</p>

<p>The following is a commented <a href="http://dbpedia.org/resource/SQL" id="link-id110945b0">SQL</a> script performing the smoosh.  We play with internal IDs of things, thus some of these operations cannot be done in SPARQL alone.  We use SPARQL where possible for readability.  As the documentation states, <code>iri_to_id</code> converts from the qualified name of an IRI to its ID and <code>id_to_iri</code> does the reverse.</p>

<p>We count the triples that enter into the smoosh:</p>

<blockquote>
<pre>-- the name is an existence because else we&#39;d get several times more due to 
-- the names occurring in many graphs <br />
sparql 
   SELECT COUNT(*) 
    WHERE { { SELECT DISTINCT ?person 
               WHERE { ?person a foaf:Person }
            } . 
            FILTER ( bif:exists ( SELECT (1) 
                                   WHERE { ?person foaf:name ?nn } 
                                )
                       ) . 
            ?person ?p ?o
          };<br />
-- We get 3284674
</pre></blockquote>

<p>We make a few tables for intermediate results.</p>

<blockquote>
<pre>-- For each distinct name, gather the properties and objects from 
-- all subjects with this name <br />
CREATE TABLE name_prop 
   ( np_name  ANY, 
     np_p     IRI_ID_8, 
     np_o     ANY, 
     PRIMARY KEY ( np_name, 
                   np_p, 
                   np_o
                 )
   );
ALTER INDEX name_prop 
   ON name_prop 
   PARTITION ( np_name VARCHAR (-1, 0hexffff) );<br />
-- Map from name to canonical IRI used for the name <br />
CREATE TABLE name_iri ( ni_name  ANY PRIMARY KEY, 
                        ni_s     IRI_ID_8
                      );
ALTER INDEX name_iri 
   ON name_iri 
   PARTITION ( ni_name VARCHAR (-1, 0hexffff) );<br />
-- Map from person IRI to canonical person IRI<br />
CREATE TABLE pref_iri 
   ( i     IRI_ID_8, 
     pref  IRI_ID_8, 
     PRIMARY KEY ( i )
   );
ALTER INDEX pref_iri 
   ON pref_iri 
   PARTITION ( i INT (0hexffff00) );<br />
-- a table for the materialization where all aliases get all properties of every other <br />
CREATE TABLE smoosh_ct 
   ( s  IRI_ID_8, 
     p  IRI_ID_8, 
     o  ANY, 
     PRIMARY KEY ( s, 
                   p, 
                   o
                 ) 
   );
ALTER INDEX smoosh_ct 
   ON smoosh_ct 
   PARTITION ( s INT (0hexffff00) );<br />
-- disable transaction log and enable row auto-commit.  This is necessary, otherwise 
-- bulk operations are done transactionally and they will run out of rollback space.<br />
LOG_ENABLE (2);<br />
-- Gather all the properties of all persons with a name under that name.  
-- INSERT SOFT means that duplicates are ignored <br />
INSERT SOFT name_prop 
   SELECT &quot;n&quot;, &quot;p&quot;, &quot;o&quot; 
   FROM ( sparql 
          DEFINE output:valmode &quot;LONG&quot; 
          SELECT ?n ?p ?o 
          WHERE { ?x a foaf:Person . 
                 ?x foaf:name ?n . 
                 ?x ?p ?o
               }
        ) xx ;<br />
-- Now choose for each name the canonical IRI <br />
INSERT INTO name_iri 
   SELECT np_name, 
          ( SELECT MIN (s) 
              FROM rdf_quad 
             WHERE o = np_name 
                   AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;)
          ) AS mini 
     FROM name_prop 
    WHERE np_p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ;<br />
-- For each person IRI, map to the canonical IRI of that person <br />
INSERT SOFT pref_iri (i, pref) 
   SELECT s, 
          ni_s 
     FROM name_iri, 
          rdf_quad 
    WHERE o = ni_name 
          AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ;<br />
-- Make a graph where all persons have one iri with all the properties of all aliases 
-- and where person-to-person refs are canonicalized<br />
INSERT SOFT rdf_quad (g,s,p,o) 
   SELECT IRI_TO_ID (&#39;psmoosh&#39;), 
          ni_s, 
          np_p, 
 COALESCE ( ( SELECT pref 
              FROM pref_iri 
              WHERE i = np_o
            ), 
            np_o 
          )
     FROM name_prop, 
          name_iri 
    WHERE ni_name = np_name 
   OPTION ( loop, quietcast ) ;<br />
-- A little explanation:  The properties of names are copied into rdf_quad with the name 
-- replaced with its canonical IRI.  If the object has a canonical IRI, this is used as 
-- the object, else the object is unmodified.  This is the COALESCE with the sub-query.<br />
-- This takes a little time.  To check on the progress, take another connection to the 
-- server and do <br />
STATUS (&#39;cluster&#39;);<br />
-- It will return something like 
-- Cluster 4 nodes, 35 s. 108 m/s 1001 KB/s  75% cpu 186%  read 12% clw threads 5r 0w 0i 
-- buffers 549481 253929 d 8 w 0 pfs<br />
-- Now finalize the state; this makes it permanent.  Else the work will be lost on server 
-- failure, since there was no transaction log <br />
CL_EXEC (&#39;checkpoint&#39;);<br />
-- See what we got<br />
sparql 
   SELECT COUNT (*) 
     FROM &lt;psmoosh&gt; 
     WHERE {?s ?p ?o};<br />
-- This is 2253102<br />
-- Now make the copy where all have the properties of all synonyms.  This takes so much 
-- space we do not insert it as RDF quads, but make a special table for it so that we can 
-- run some statistics.  This saves time.<br />
INSERT SOFT smoosh_ct (s, p, o)  
   SELECT s, np_p, np_o 
     FROM name_prop, 
          rdf_quad 
    WHERE o = np_name 
          AND p = IRI_TO_ID (&#39;http://xmlns.com/foaf/0.1/name&#39;) ;<br />
-- as above, INSERT SOFT so as to ignore duplicates <br />
SELECT COUNT (*) 
   FROM smoosh_ct;<br />
-- This is  167360324<br />
-- Find out where the bloat comes from <br />
SELECT TOP 20 COUNT (*), 
              ID_TO_IRI (p) 
   FROM smoosh_ct 
   GROUP BY p 
   ORDER BY 1 DESC;
</pre></blockquote>
<p>The results are:</p>

<blockquote>
<pre>54728777          http://www.w3.org/2002/07/owl#sameAs
48543153          http://xmlns.com/foaf/0.1/knows
13930234          http://www.w3.org/2000/01/rdf-schema#seeAlso
12268512          http://xmlns.com/foaf/0.1/interest
11415867          http://xmlns.com/foaf/0.1/nick
6683963           http://xmlns.com/foaf/0.1/weblog
6650093           http://xmlns.com/foaf/0.1/depiction
4231946           http://xmlns.com/foaf/0.1/mbox_sha1sum
4129629           http://xmlns.com/foaf/0.1/homepage
1776555           http://xmlns.com/foaf/0.1/holdsAccount
1219525           http://xmlns.com/foaf/0.1/based_near
305522            http://www.w3.org/1999/02/22-rdf-syntax-ns#type
274965            http://xmlns.com/foaf/0.1/name
155131            http://xmlns.com/foaf/0.1/dateOfBirth
153001            http://xmlns.com/foaf/0.1/img
111130            http://www.w3.org/2001/vcard-rdf/3.0#ADR
52930             http://xmlns.com/foaf/0.1/gender
48517             http://www.w3.org/2004/02/skos/core#subject
45697             http://www.w3.org/2000/01/rdf-schema#label
44860             http://purl.org/vocab/bio/0.1/olb
</pre></blockquote>

<p>Now compare with the predicate distribution of the smoosh with identities canonicalized:</p>

<blockquote>
<pre>sparql 
     SELECT COUNT (*) ?p 
       FROM &lt;psmoosh&gt; 
      WHERE { ?s ?p ?o } 
   GROUP BY ?p 
   ORDER BY 1 DESC 
      LIMIT 20;</pre></blockquote>

<p>Results are:</p>
<blockquote>
<pre>748311            http://xmlns.com/foaf/0.1/knows
548391            http://xmlns.com/foaf/0.1/interest
140531            http://www.w3.org/2000/01/rdf-schema#seeAlso
105273            http://www.w3.org/1999/02/22-rdf-syntax-ns#type
78497             http://xmlns.com/foaf/0.1/name
48099             http://www.w3.org/2004/02/skos/core#subject
45179             http://xmlns.com/foaf/0.1/depiction
40229             http://www.w3.org/2000/01/rdf-schema#comment
38272             http://www.w3.org/2000/01/rdf-schema#label
37378             http://xmlns.com/foaf/0.1/nick
37186             http://dbpedia.org/property/abstract
34003             http://xmlns.com/foaf/0.1/img
26182             http://xmlns.com/foaf/0.1/homepage
23795             http://www.w3.org/2002/07/owl#sameAs
17651             http://xmlns.com/foaf/0.1/mbox_sha1sum
17430             http://xmlns.com/foaf/0.1/dateOfBirth
15586             http://xmlns.com/foaf/0.1/page
12869             http://dbpedia.org/property/reference
12497             http://xmlns.com/foaf/0.1/weblog
12329             http://blogs.yandex.ru/schema/foaf/school
</pre></blockquote>

<p>We can drop the <code>owl:sameAs</code> triples from the count, so the bloat is a bit less by that but it still is tens of times larger than the canonicalized copy or the initial state.</p>

<p>Now, when we try using the psmoosh graph, we still get different results from the results with the original data.  This is because <code>foaf:knows</code> relations to things with no <code>foaf:name</code> are not represented in the smoosh.  They exist:</p>

<blockquote>
<pre>sparql 
SELECT COUNT (*) 
   WHERE { ?s foaf:knows ?thing . 
           FILTER ( !bif:exists ( SELECT (1) 
                                   WHERE { ?thing foaf:name ?nn }
                                )
                  ) 
         };<br />
-- 1393940
</pre></blockquote>

<p>So the smoosh graph is not an accurate rendition of the social network.  It would have to be smooshed further to be that, since the data in the sample is quite irregular.  But we do not go that far here.</p>

<p>Finally, we calculate the smoosh blow up factors.  We do not include <code>owl:sameAs</code> triples in the counts.</p>

<blockquote>
<pre>select (167360324 - 54728777) / 3284674.0;
34.290022997716059<br />
select 2229307 / 3284674.0;
= 0.678699621332284
</pre></blockquote>

<p>So, to get a smoosh that is not really the equivalent of the original, either multiply the original triple count by 34 or 0.68, depending on whether synonyms are collapsed or not.</p>

<p>Making the smooshes does not take very long, some minutes for the small one.  Inserting the big one would be longer, a couple of hours maybe.  It was 33 minutes for filling the <code>smoosh_ct</code> table.  The metrics were not with optimal tuning so the performance numbers just serve to show that smooshing takes time.  Probably more time than allowable in an interactive situation, no matter how the process is optimized.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-12-11#1495">
  <rss:title>Virtuoso Anytime:  No Query Is Too Complex (updated)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-12-11T16:13:10Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">A persistent argument against the linked data web has been the cost, scalability, and vulnerability of SPARQL end points, should the linked data web gain serious mass and traffic. As we are on the brink of hosting the whole DBpedia Linked Open Data cloud in Virtuoso Cluster, we have had to think of what we&#39;ll do if, for example, somebody decides to count all the triples in the set. How can we encourage clever use of data, yet not die if somebody, whether through malice, lack of understanding, or simple bad luck, submits impossible queries? Restricting the language is not the way; any language beyond text search can express queries that will take forever to execute. Also, just returning a timeout after the first second (or whatever arbitrary time period) leaves people in the dark and does not produce an impression of responsiveness. So we decided to allow arbitrary queries, and if a quota of time or resources is exceeded, we return partial results and indicate how much processing was done. Here we are looking for the top 10 people whom people claim to know without being known in return, like this: SQL&gt; sparql SELECT ?celeb, COUNT (*) WHERE { ?claimant foaf:knows ?celeb . FILTER (!bif:exists ( SELECT (1) WHERE { ?celeb foaf:knows ?claimant } ) ) } GROUP BY ?celeb ORDER BY DESC 2 LIMIT 10; celeb callret-1 VARCHAR VARCHAR ________________________________________ _________ http://twitter.com/BarackObama 252 http://twitter.com/brianshaler 183 http://twitter.com/newmediajim 101 http://twitter.com/HenryRollins 95 http://twitter.com/wilw 81 http://twitter.com/stevegarfield 78 http://twitter.com/cote 66 mailto:adam.westerski@deri.org 66 mailto:michal.zaremba@deri.org 66 http://twitter.com/dsifry 65 *** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete results, query interrupted by result timeout. 
Activity: 1R rnd 0R seq 0P disk 1.346KB / 3 messages SQL&gt; sparql SELECT ?celeb, COUNT (*) WHERE { ?claimant foaf:knows ?celeb . FILTER (!bif:exists ( SELECT (1) WHERE { ?celeb foaf:knows ?claimant } ) ) } GROUP BY ?celeb ORDER BY DESC 2 LIMIT 10; celeb callret-1 VARCHAR VARCHAR ________________________________________ _________ http://twitter.com/JasonCalacanis 496 http://twitter.com/Twitterrific 466 http://twitter.com/ev 442 http://twitter.com/BarackObama 356 http://twitter.com/laughingsquid 317 http://twitter.com/gruber 294 http://twitter.com/chrispirillo 259 http://twitter.com/ambermacarthur 224 http://twitter.com/t 219 http://twitter.com/johnedwards 188 *** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete results, query interrupted by result timeout. Activity: 329R rnd 44.6KR seq 342P disk 638.4KB / 46 messages The first query read all data from disk; the second run had the working set from the first and could read some more before time ran out, hence the results were better. But the response time was the same. If one has a query that just loops over consecutive joins, like in basic SPARQL, interrupting the processing after a set time period is simple. But such queries are not very interesting. To give meaningful partial answers with nested aggregation and sub-queries requires some more tricks. The basic idea is to terminate the innermost active sub-query/aggregation at the first timeout, and extend the timeout a bit so that accumulated results get fed to the next aggregation, like from the GROUP BY to the ORDER BY. If this again times out, we continue with the next outer layer. This guarantees that results are delivered if there were any results found for which the query pattern is true. False results are not produced, except in cases where there is comparison with a count and the count is smaller than it would be with the full evaluation. One can also use this as a basis for paid services. 
The cutoff does not have to be time; it can also be in other units, making it insensitive to concurrent usage and variations of working set. This system will be deployed on our Billion Triples Challenge demo instance in a few days, after some more testing. When Virtuoso 6 ships, all LOD Cloud AMIs and OpenLink-hosted LOD Cloud SPARQL endpoints will have this enabled by default. (AMI users will be able to disable the feature, if desired.) The feature works with Virtuoso 6 in both single server and cluster deployment.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<p>A persistent argument against the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id1199d5f8">linked data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id116f2730">web</a> has been the cost, scalability, and vulnerability of <a href="http://dbpedia.org/resource/SPARQL" id="link-id14e423c0">SPARQL</a> end points, should the linked data web gain serious mass and traffic.</p>

<p>As we are on the brink of hosting the whole <a href="http://dbpedia.org/resource/DBpedia" id="link-id1376a8b0">DBpedia</a> <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id113c8d20">Linked Open Data</a> cloud in <a href="http://virtuoso.openlinksw.com" id="link-id11425a78">Virtuoso</a> Cluster, we have had to think of what we&#39;ll do if, for example, somebody decides to count all the triples in the set.</p>

<p>How can we encourage clever use of <a href="http://dbpedia.org/resource/Data" id="link-id116f1210">data</a>, yet not die if somebody, whether through malice, lack of understanding, or simple bad luck, submits impossible queries?</p>

<p>Restricting the language is not the way; any language beyond text search can express queries that will take forever to execute.  Also, just returning a timeout after the first second (or whatever arbitrary time period) leaves people in the dark and does not produce an impression of responsiveness.  So we decided to allow arbitrary queries, and if a quota of time or resources is exceeded, we return partial results and indicate how much processing was done.</p>

<p>Here we are looking for the top 10 people whom people claim to know without being known in return, like this:</p>

<blockquote>
<pre>SQL&gt; sparql 
SELECT ?celeb, 
       COUNT (*)
WHERE { ?claimant foaf:knows ?celeb .
        FILTER (!bif:exists ( SELECT (1) 
                              WHERE { ?celeb foaf:knows ?claimant }
                            )
               )
      } 
GROUP BY ?celeb 
ORDER BY DESC 2 
LIMIT 10;<br />
celeb                                      callret-1
VARCHAR                                    VARCHAR
________________________________________   _________<br />
http://twitter.com/BarackObama             252
http://twitter.com/brianshaler             183
http://twitter.com/newmediajim             101
http://twitter.com/HenryRollins            95
http://twitter.com/wilw                    81
http://twitter.com/stevegarfield           78
http://twitter.com/cote                    66
mailto:adam.westerski@deri.org             66
mailto:michal.zaremba@deri.org             66
http://twitter.com/dsifry                  65<br />
*** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete 
results, query interrupted by result timeout.  
Activity:      1R rnd      0R seq      0P disk  1.346KB /      3 messages<br />
SQL&gt; sparql 
SELECT ?celeb, 
       COUNT (*)
WHERE { ?claimant foaf:knows ?celeb .
        FILTER (!bif:exists ( SELECT (1) 
                              WHERE { ?celeb foaf:knows ?claimant }
                            )
               )
      } 
GROUP BY ?celeb 
ORDER BY DESC 2 
LIMIT 10;<br />
celeb                                      callret-1
VARCHAR                                    VARCHAR
________________________________________   _________<br />
http://twitter.com/JasonCalacanis          496
http://twitter.com/Twitterrific            466
http://twitter.com/ev                      442
http://twitter.com/BarackObama             356
http://twitter.com/laughingsquid           317
http://twitter.com/gruber                  294
http://twitter.com/chrispirillo            259
http://twitter.com/ambermacarthur          224
http://twitter.com/t                       219
http://twitter.com/johnedwards             188<br />
*** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete 
results, query interrupted by result timeout.  
Activity:    329R rnd   44.6KR seq    342P disk  638.4KB /     46 messages</pre></blockquote>

<p>The first query read all data from disk; the second run had the working set from the first and could read some more before time ran out, hence the results were better.  But the response time was the same.</p>

<p>If one has a query that just loops over consecutive joins, like in basic SPARQL, interrupting the processing after a set time period is simple.  But such queries are not very interesting.  To give meaningful partial answers with nested aggregation and sub-queries requires some more tricks.  The basic idea is to terminate the innermost active sub-query/aggregation at the first timeout, and extend the timeout a bit so that accumulated results get fed to the next aggregation, like from the <code>GROUP BY</code> to the <code>ORDER BY</code>.  If this again times out, we continue with the next outer layer.  This guarantees that results are delivered if there were any results found for which the query pattern is true.  False results are not produced, except in cases where there is comparison with a count and the count is smaller than it would be with the full evaluation.</p>

<p>One can also use this as a basis for paid services.  The cutoff does not have to be time; it can also be in other units, making it insensitive to concurrent usage and variations of working set.</p>

<p>This system will be deployed on our <a href="http://challenge.semanticweb.org/" id="link-id11500a58">Billion Triples Challenge</a> <a href="http://b3s.openlinksw.com/" id="link-id11683120">demo instance</a> in a few days, after some more testing.  When Virtuoso 6 ships, all <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id1157a500">LOD</a> Cloud AMIs and OpenLink-hosted LOD Cloud SPARQL endpoints will have this enabled by default.  (AMI users will be able to disable the feature, if desired.)  The feature works with Virtuoso 6 in both single server and cluster deployment.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-12-11#1494">
  <rss:title>Virtuoso Anytime:  No Query Is Too Complex (updated)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-12-11T16:13:10Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">A persistent argument against the linked data web has been the cost, scalability, and vulnerability of SPARQL end points, should the linked data web gain serious mass and traffic. As we are on the brink of hosting the whole DBpedia Linked Open Data cloud in Virtuoso Cluster, we have had to think of what we&#39;ll do if, for example, somebody decides to count all the triples in the set. How can we encourage clever use of data, yet not die if somebody, whether through malice, lack of understanding, or simple bad luck, submits impossible queries? Restricting the language is not the way; any language beyond text search can express queries that will take forever to execute. Also, just returning a timeout after the first second (or whatever arbitrary time period) leaves people in the dark and does not produce an impression of responsiveness. So we decided to allow arbitrary queries, and if a quota of time or resources is exceeded, we return partial results and indicate how much processing was done. Here we are looking for the top 10 people whom people claim to know without being known in return, like this: SQL&gt; sparql SELECT ?celeb, COUNT (*) WHERE { ?claimant foaf:knows ?celeb . FILTER (!bif:exists ( SELECT (1) WHERE { ?celeb foaf:knows ?claimant } ) ) } GROUP BY ?celeb ORDER BY DESC 2 LIMIT 10; celeb callret-1 VARCHAR VARCHAR ________________________________________ _________ http://twitter.com/BarackObama 252 http://twitter.com/brianshaler 183 http://twitter.com/newmediajim 101 http://twitter.com/HenryRollins 95 http://twitter.com/wilw 81 http://twitter.com/stevegarfield 78 http://twitter.com/cote 66 mailto:adam.westerski@deri.org 66 mailto:michal.zaremba@deri.org 66 http://twitter.com/dsifry 65 *** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete results, query interrupted by result timeout. 
Activity: 1R rnd 0R seq 0P disk 1.346KB / 3 messages SQL&gt; sparql SELECT ?celeb, COUNT (*) WHERE { ?claimant foaf:knows ?celeb . FILTER (!bif:exists ( SELECT (1) WHERE { ?celeb foaf:knows ?claimant } ) ) } GROUP BY ?celeb ORDER BY DESC 2 LIMIT 10; celeb callret-1 VARCHAR VARCHAR ________________________________________ _________ http://twitter.com/JasonCalacanis 496 http://twitter.com/Twitterrific 466 http://twitter.com/ev 442 http://twitter.com/BarackObama 356 http://twitter.com/laughingsquid 317 http://twitter.com/gruber 294 http://twitter.com/chrispirillo 259 http://twitter.com/ambermacarthur 224 http://twitter.com/t 219 http://twitter.com/johnedwards 188 *** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete results, query interrupted by result timeout. Activity: 329R rnd 44.6KR seq 342P disk 638.4KB / 46 messages The first query read all data from disk; the second run had the working set from the first and could read some more before time ran out, hence the results were better. But the response time was the same. If one has a query that just loops over consecutive joins, like in basic SPARQL, interrupting the processing after a set time period is simple. But such queries are not very interesting. To give meaningful partial answers with nested aggregation and sub-queries requires some more tricks. The basic idea is to terminate the innermost active sub-query/aggregation at the first timeout, and extend the timeout a bit so that accumulated results get fed to the next aggregation, like from the GROUP BY to the ORDER BY. If this again times out, we continue with the next outer layer. This guarantees that results are delivered if there were any results found for which the query pattern is true. False results are not produced, except in cases where there is comparison with a count and the count is smaller than it would be with the full evaluation. One can also use this as a basis for paid services. 
The cutoff does not have to be time; it can also be in other units, making it insensitive to concurrent usage and variations of working set. This system will be deployed on our Billion Triples Challenge demo instance in a few days, after some more testing. When Virtuoso 6 ships, all LOD Cloud AMIs and OpenLink-hosted LOD Cloud SPARQL endpoints will have this enabled by default. (AMI users will be able to disable the feature, if desired.) The feature works with Virtuoso 6 in both single server and cluster deployment.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<p>A persistent argument against the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id1199d5f8">linked data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id116f2730">web</a> has been the cost, scalability, and vulnerability of <a href="http://dbpedia.org/resource/SPARQL" id="link-id14e423c0">SPARQL</a> end points, should the linked data web gain serious mass and traffic.</p>

<p>As we are on the brink of hosting the whole <a href="http://dbpedia.org/resource/DBpedia" id="link-id1376a8b0">DBpedia</a> <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id113c8d20">Linked Open Data</a> cloud in <a href="http://virtuoso.openlinksw.com" id="link-id11425a78">Virtuoso</a> Cluster, we have had to think of what we&#39;ll do if, for example, somebody decides to count all the triples in the set.</p>

<p>How can we encourage clever use of <a href="http://dbpedia.org/resource/Data" id="link-id116f1210">data</a>, yet not die if somebody, whether through malice, lack of understanding, or simple bad luck, submits impossible queries?</p>

<p>Restricting the language is not the way; any language beyond text search can express queries that will take forever to execute.  Also, just returning a timeout after the first second (or whatever arbitrary time period) leaves people in the dark and does not produce an impression of responsiveness.  So we decided to allow arbitrary queries, and if a quota of time or resources is exceeded, we return partial results and indicate how much processing was done.</p>

<p>Here we are looking for the top 10 people whom people claim to know without being known in return, like this:</p>

<blockquote>
<pre>SQL&gt; sparql 
SELECT ?celeb, 
       COUNT (*)
WHERE { ?claimant foaf:knows ?celeb .
        FILTER (!bif:exists ( SELECT (1) 
                              WHERE { ?celeb foaf:knows ?claimant }
                            )
               )
      } 
GROUP BY ?celeb 
ORDER BY DESC 2 
LIMIT 10;<br />
celeb                                      callret-1
VARCHAR                                    VARCHAR
________________________________________   _________<br />
http://twitter.com/BarackObama             252
http://twitter.com/brianshaler             183
http://twitter.com/newmediajim             101
http://twitter.com/HenryRollins            95
http://twitter.com/wilw                    81
http://twitter.com/stevegarfield           78
http://twitter.com/cote                    66
mailto:adam.westerski@deri.org             66
mailto:michal.zaremba@deri.org             66
http://twitter.com/dsifry                  65<br />
*** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete 
results, query interrupted by result timeout.  
Activity:      1R rnd      0R seq      0P disk  1.346KB /      3 messages<br />
SQL&gt; sparql 
SELECT ?celeb, 
       COUNT (*)
WHERE { ?claimant foaf:knows ?celeb .
        FILTER (!bif:exists ( SELECT (1) 
                              WHERE { ?celeb foaf:knows ?claimant }
                            )
               )
      } 
GROUP BY ?celeb 
ORDER BY DESC 2 
LIMIT 10;<br />
celeb                                      callret-1
VARCHAR                                    VARCHAR
________________________________________   _________<br />
http://twitter.com/JasonCalacanis          496
http://twitter.com/Twitterrific            466
http://twitter.com/ev                      442
http://twitter.com/BarackObama             356
http://twitter.com/laughingsquid           317
http://twitter.com/gruber                  294
http://twitter.com/chrispirillo            259
http://twitter.com/ambermacarthur          224
http://twitter.com/t                       219
http://twitter.com/johnedwards             188<br />
*** Error S1TAT: [Virtuoso Driver][Virtuoso Server]RC...: Returning incomplete 
results, query interrupted by result timeout.  
Activity:    329R rnd   44.6KR seq    342P disk  638.4KB /     46 messages</pre></blockquote>

<p>The first query read all data from disk; the second run had the working set from the first and could read some more before time ran out, hence the results were better.  But the response time was the same.</p>

<p>If one has a query that just loops over consecutive joins, like in basic SPARQL, interrupting the processing after a set time period is simple.  But such queries are not very interesting.  To give meaningful partial answers with nested aggregation and sub-queries requires some more tricks.  The basic idea is to terminate the innermost active sub-query/aggregation at the first timeout, and extend the timeout a bit so that accumulated results get fed to the next aggregation, like from the <code>GROUP BY</code> to the <code>ORDER BY</code>.  If this again times out, we continue with the next outer layer.  This guarantees that results are delivered if there were any results found for which the query pattern is true.  False results are not produced, except in cases where there is comparison with a count and the count is smaller than it would be with the full evaluation.</p>

<p>One can also use this as a basis for paid services.  The cutoff does not have to be time; it can also be in other units, making it insensitive to concurrent usage and variations of working set.</p>

<p>This system will be deployed on our <a href="http://challenge.semanticweb.org/" id="link-id11500a58">Billion Triples Challenge</a> <a href="http://b3s.openlinksw.com/" id="link-id11683120">demo instance</a> in a few days, after some more testing.  When Virtuoso 6 ships, all <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id1157a500">LOD</a> Cloud AMIs and OpenLink-hosted LOD Cloud SPARQL endpoints will have this enabled by default.  (AMI users will be able to disable the feature, if desired.)  The feature works with Virtuoso 6 in both single server and cluster deployment.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-11-28#1489">
  <rss:title>Introducing Virtuoso Universal Server (Cloud Edition) for Amazon EC2</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-11-28T19:27:12Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">What is it? A pre-installed edition of Virtuoso for Amazon&#39;s EC2 Cloud platform. What does it offer? From a Web Entrepreneur perspective it offers: Low cost entry point to a game-changing Web 3.0+ (and beyond) platform that combines SQL, RDF, XML, and Web Services functionality Flexible variable cost model (courtesy of EC2 DevPay) tightly bound to revenue generated by your services Delivers federated and/or centralized model flexibility for your SaaS based solutions Simple entry point for developing and deploying sophisticated database driven applications (SQL or RDF Linked Data Web oriented) Complete framework for exploiting OpenID, OAuth (including Role enhancements) that simplifies exploitation of these vital Identity and Data Access technologies Easily implement RDF Linked Data based Mail, Blogging, Wikis, Bookmarks, Calendaring, Discussion Forums, Tagging, Social-Networking as Data Space (data containers) features of your application or service offering Instant alleviation of challenges (e.g. service costs and agility) associated with Data Portability and Open Data Access across Web 2.0 data silos LDAP integration for Intranet / Extranet style applications. From the DBMS engine perspective it provides you with one or more pre-configured instances of Virtuoso that enable immediate exploitation of the following services: RDF Database (a Quad Store with SPARQL &amp; SPARUL Language &amp; Protocol support) SQL Database (with ODBC, JDBC, OLE-DB, ADO.NET, and XMLA driver access) XML Database (XML Schema, XQuery/Xpath, XSLT, Full Text Indexing) Full Text Indexing. From a Middleware perspective it provides: RDF Views (Wrappers / Semantic Covers) over SQL, XML, and other data sources accessible via SOAP or REST style Web Services Sponger Service for converting non RDF information resources into RDF Linked Data &quot;on the fly&quot; via a large collection of pre-installed RDFizer Cartridges. 
From the Web Server Platform perspective it provides an alternative to LAMP stack components such as MySQL and Apache by offering HTTP Web Server WebDAV Server Web Application Server (includes PHP runtime hosting) SOAP or REST style Web Services Deployment RDF Linked Data Deployment SPARQL (SPARQL Query Language) and SPARUL (SPARQL Update Language) endpoints Virtuoso Hosted PHP packages for MediaWiki, Drupal, Wordpress, and phpBB3 (just install the relevant Virtuoso Distro. Package). From the general System Administrator&#39;s perspective it provides: Online Backups (Backup Set dispatched to S3 buckets, FTP, or HTTP/WebDAV server locations) Synchronized Incremental Backups to Backup Set locations Backup Restore from Backup Set location (without exiting to EC2 shell). Higher level user oriented offerings include: OpenLink Data Explorer front-end for exploring the burgeoning Linked Data Web Ajax based SPARQL Query Builder (iSPARQL) that enables SPARQL Query construction by Example Ajax based SQL Query Builder (QBE) that enables SQL Query construction by Example. For Web 2.0 / 3.0 users, developers, and entrepreneurs it offers Distributed Collaboration Tools &amp; Social Media realm functionality courtesy of ODS that includes: Point of presence on the Linked Data Web that meshes your Identity and your Data via URIs System generated Social Network Profile &amp; Contact Data via FOAF System generated SIOC (Semantically Interconnected Online Community) Data Space (that includes a Social Graph) exposing all your Web data in RDF Linked Data form System generated OpenID and automatic integration with FOAF Transparent Data Integration across Facebook, Digg, LinkedIn, FriendFeed, Twitter, and any other Web 2.0 data space equipped with RSS / Atom support and/or REST style Web Services In-built support for SyncML which enables data synchronization with Mobile Phones. How Do I Get Going with It? 
Standard Installation Guide Personal or Service Specific DBpedia Installation Guide</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<h3>What is it?</h3>
<p>A pre-installed edition of <a href="http://virtuoso.openlinksw.com" id="link-id14bea838">Virtuoso</a> for Amazon&#39;s EC2 Cloud platform.</p>

<h3>What does it offer?</h3>
From a <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> Entrepreneur perspective it offers:
<ol>
<li>
Low cost entry point to a game-changing Web 3.0+ (and beyond) platform that combines <a href="http://dbpedia.org/resource/SQL" id="link-id11309b38">SQL</a>, <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id135f7988">RDF</a>, XML, and Web Services functionality</li>
<li>
Flexible variable cost model (courtesy of <a href="http://aws.amazon.com/devpay/" id="link-id17941018">EC2 DevPay</a>) tightly bound to revenue generated by your services</li>
<li>
Delivers federated and/or centralized model flexibility for your SaaS based solutions</li>
<li>
Simple entry point for developing and deploying sophisticated database driven applications (SQL or RDF <a href="http://dbpedia.org/resource/Linked_Data" id="link-id14ea6b10">Linked Data Web</a> oriented)</li>
<li>
Complete framework for exploiting OpenID, OAuth (including Role enhancements) that simplifies exploitation of these vital Identity and <a href="http://dbpedia.org/resource/Data">Data</a> Access technologies</li>
<li>Easily implement RDF Linked Data based Mail, Blogging, Wikis, Bookmarks, Calendaring, Discussion Forums, Tagging, Social-Networking as <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id11519928">Data Space</a> (data containers) features of your application or service offering</li>
<li>Instant alleviation of challenges (e.g. service costs and agility) associated with <a href="http://dbpedia.org/resource/DataPortability" id="link-id111cb610">Data Portability</a> and Open Data Access across Web 2.0 data silos</li>
<li>
LDAP integration for <a href="http://dbpedia.org/resource/Intranet" id="link-id114a8270">Intranet</a> / <a href="http://dbpedia.org/resource/Extranet" id="link-id10fe4f08">Extranet</a> style applications.</li>
</ol>
<p>From the DBMS engine perspective it provides you with one or more pre-configured instances of Virtuoso that enable immediate exploitation of the following services:</p>
<ol>
<li>
RDF Database (a Quad Store with <a href="http://dbpedia.org/resource/SPARQL" id="link-id11911bf8">SPARQL</a> &amp; SPARUL Language &amp; Protocol support)</li>
<li>
  <a href="http://dbpedia.org/resource/SQL" id="link-id110544c8">SQL</a> Database (with <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id1524c7d0">ODBC</a>, <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id14cfb658">JDBC</a>, OLE-DB, <a href="http://dbpedia.org/resource/ADO.NET" id="link-id110ec6c8">ADO</a>.NET, and XMLA driver access)</li>
<li>XML Database (XML Schema, <a href="http://dbpedia.org/resource/XQuery" id="link-id10ebf218">XQuery</a>/<a href="http://dbpedia.org/resource/XPath" id="link-id142a7898">Xpath</a>, XSLT, Full Text Indexing)</li>
<li>Full Text Indexing.</li>
</ol>

<p>From a Middleware perspective it provides:</p>
<ol>
<li>
RDF Views (Wrappers / Semantic Covers) over SQL, XML, and other data sources accessible via SOAP or REST style Web Services</li>
<li>
Sponger Service for converting non RDF <a href="http://dbpedia.org/resource/Information" id="link-id11931c60">information</a> resources into RDF <a href="http://dbpedia.org/resource/Linked_Data" id="link-id118f7168">Linked Data</a> &quot;on the fly&quot; via a large collection of pre-installed  RDFizer Cartridges.</li>
</ol>

<p>From the Web Server Platform perspective it provides an alternative to LAMP stack components such as <a href="http://dbpedia.org/resource/MySQL" id="link-id10f7b780">MySQL</a> and Apache by offering</p>
<ol>
<li>
HTTP Web Server</li>
<li>
WebDAV Server</li>
<li>
Web <a href="http://dbpedia.org/resource/Application_server" id="link-id1268daa8">Application Server</a> (includes <a href="http://dbpedia.org/resource/PHP" id="link-id1585d238">PHP</a> runtime hosting)</li>
<li>
SOAP or REST style Web Services Deployment</li>
<li>
RDF Linked Data Deployment</li>
<li>
SPARQL (SPARQL Query Language) and SPARUL (SPARQL Update Language) endpoints</li>
<li>Virtuoso Hosted PHP packages for <a href="http://dbpedia.org/resource/MediaWiki" id="link-id15568818">MediaWiki</a>, <a href="http://dbpedia.org/resource/Drupal" id="link-id110bd7a8">Drupal</a>, <a href="http://dbpedia.org/resource/WordPress" id="link-id10f66918">Wordpress</a>, and <a href="http://dbpedia.org/resource/PhpBB" id="link-id13fda4d0">phpBB3</a> (just install the relevant Virtuoso Distro. Package).
</li>
</ol>

<p>From the general System Administrator&#39;s perspective it provides:</p>
<ol>
<li>
Online Backups (Backup Set dispatched to S3 buckets, FTP, or HTTP/WebDAV server locations)</li>
<li>Synchronized Incremental Backups to Backup Set locations</li>
<li>Backup Restore from Backup Set location (without exiting to EC2 shell).</li>
</ol>

<p>Higher level user oriented offerings include:</p>
<ol>
<li>OpenLink Data Explorer front-end for exploring the burgeoning Linked Data <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id11646dc8">Web</a>
</li>
<li>
Ajax based SPARQL Query Builder (iSPARQL) that enables SPARQL Query construction by Example</li>
<li>Ajax based SQL Query Builder (QBE) that enables SQL Query construction by Example.</li>
</ol>

<p>For Web 2.0 / 3.0 users, developers, and entrepreneurs it offers Distributed Collaboration Tools &amp; Social Media realm functionality courtesy of <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id11009930">ODS</a> that includes:</p>
<ol>
<li>
Point of presence on the Linked Data Web that meshes your Identity and your Data via URIs</li>
<li>
System generated Social Network Profile &amp; Contact Data via <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id1185a1c0">FOAF</a></li>
<li>
System generated <a href="http://dbpedia.org/resource/SIOC" id="link-id14791890">SIOC</a> (Semantically Interconnected Online Community) <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id1577cad8">Data Space</a> (that includes a Social Graph) exposing all your Web data in RDF Linked Data form</li>
<li>
System generated OpenID and automatic integration with FOAF</li>
<li>
Transparent Data Integration across Facebook, Digg, LinkedIn, FriendFeed, Twitter, and any other Web 2.0 data space equipped with RSS / Atom support and/or REST style Web Services</li>
<li>
In-built support for SyncML which enables data synchronization with Mobile Phones.</li>
</ol>
<h3>How Do I Get Going with It?</h3>
<ul>
<li>
<a href="http://virtuoso.openlinksw.com/dataspace/dav/wiki/Main/ODSInstallationEC2" id="link-id114e1600">Standard Installation Guide</a>
</li>
<li>
<a href="http://virtuoso.openlinksw.com/dataspace/dav/wiki/Main/VirtEC2AMIDBpediaInstall" id="link-id110a98e8">Personal or Service Specific DBpedia Installation Guide</a>
</li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-11-27#1488">
  <rss:title>An Example of RDF Scalability</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-11-27T11:23:47Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">We hear it to exhaustion, where is RDF scalability? We have been suggesting for a while that this is a solved question. I will here give some concrete numbers to back this. The scalability dream is to add hardware and get increased performance in proportion to the power the added component has when measured by itself. A corollary dream is to take scalability effects that are measured in a simple task and see them in a complex task. Below we show how we do 3.3 million random triple lookups per second on two 8 core commodity servers producing complete results, joining across partitions. On a single 4 core server, the figure is about 1 million lookups per second. With a single thread, it is about 250K lookups per second. This is the good case. But even our worse case is quite decent. We took a simple SPARQL query, counting how many people say they reciprocally know each other. In the Billion Triples Challenge data set, there are 25M foaf:knows quads of which 92K are reciprocal. Reciprocal here means that when x knows y in some graph, y knows x in the same or any other graph. SELECT COUNT (*) WHERE { ?p1 foaf:knows ?p2 . ?p2 foaf:knows ?p1 } There is no guarantee that the triple of x knows y is in the same partition as the triple y knows x. Thus the join is randomly distributed, n partitions to n partitions. We left this out of the Billion Triples Challenge demo because this did not run fast enough for our liking. Since then, we have corrected this. If run on a single thread, this query would be a loop over all the quads with a predicate of foaf:knows, and an inner loop looking for a quad with 3 of 4 fields given (SPO). If we have a partitioned situation, we have a loop over all the foaf:knows quads in each partition, and an inner lookup looking for the reciprocal foaf:knows quad in whatever partition it may be found. 
We have implemented this with two different message patterns: Centralized: One process reads all the foaf:knows quads from all processes. Every 50K quads, it sends a batch of reciprocal quad checks to each partition that could contain a reciprocal quad. Each partition keeps the count of found reciprocal quads, and these are gathered and added up at the end. Symmetrical: Each process reads the foaf:knows quads in its partition, and sends a batch of checks to each process that could have the reciprocal foaf:knows quad every 50K quads. At the end, the counts are gathered from all partitions. There is some additional control traffic but we do not go into its details here. Below is the result measured on 2 machines each with 2 x Xeon 5345 (quad core; total 8 cores), 16G RAM, and each machine running 6 Virtuoso instances. The interconnect is dual 1-Gbit ethernet. Numbers are with warm cache. Centralized: 35,543 msec, 728,634 sequential + random lookups per second Cluster 12 nodes, 35 s. 1072 m/s 39,085 KB/s 316% cpu ... Symmetrical: 7706 msec, 3,360,740 sequential + random lookups per second Cluster 12 nodes, 7 s. 572 m/s 16,983 KB/s 1137% cpu ... The second line is the summary from the cluster status report for the duration of the query. The interesting numbers are the KB/s and the %CPU. The former is the cross-sectional data transfer rate for intra-cluster communication; the latter is the consolidated CPU utilization, where a constantly-busy core counts for 100%. The point to note is that the symmetrical approach takes 4x less real time with under half the data transfer rate. Further, when using multiple machines, the speed of a single interface does not limit the overall throughput as it does in the centralized situation. These figures represent the best and worst cases of distributed JOINing. 
If we have a straight sequence of JOINs, with single pattern optionals and existences and the order in which results are produced is not significant (i.e., there is aggregation, existence test, or ORDER BY), the symmetrical pattern is applicable. On the other hand, if there are multiple triple pattern optionals, complex sub-queries, DISTINCTs in the middle of the query, or results have to be produced in the order of an index, then the centralized approach must be used at least part of the time. Also, if we must make transitive closures, which can be thought of as an extension of a DISTINCT in a subquery, we must pass the data through a single point before moving the bindings to the next JOIN in the sequence. This happens for example in resolving owl:sameAs at run time. However, the good news is that performance does not fall much below the centralized figure even when there are complex nested structures with intermediate transitive closures, DISTINCTs, complex existence tests, etc., that require passing all intermediate results through a central point. No matter the complexity, it is always possible to vector some tens-of-thousands of variable bindings into a single message exchange. And if there are not that many intermediate results, then single query execution time is not a problem anyhow. For our sample query, we would get still more speed by using a partitioned hash join, filling the hash from the foaf:knows relations and then running the foaf:knows relations through the hash. If the hash size is right, a hash lookup is somewhat better than an index lookup. The problem is that when the hash join is not the right solution, it is an expensive mistake: the best case is good; the worst case is very bad. But if there is no index then hash join is better than nothing. One problem of hash joins is that they make temporary data structures which, if large, will skew the working set. One must be quite sure of the cardinality before it is safe to try a hash join. 
So we do not do hash joins with RDF, but we do use them sometimes with relational data. These same methods apply to relational data just as well. This does not make generic RDF storage outperform an application-specific relational representation on the same platform, as the latter benefits from all the same optimizations, but in terms of sheer numbers, this makes RDF representation an option where it was not an option before. RDF is all about not needing to design the schema around the queries, and not needing to limit what joins with what else.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>We hear it to exhaustion, where is <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x14e828d8">RDF</a> scalability?  We have been suggesting for a while that this is a solved question.  I will here give some concrete numbers to back this.</p>

<p>The scalability dream is to add hardware and get increased performance in proportion to the power the added component has when measured by itself. A corollary dream is to take scalability effects that are measured in a simple task and see them in a complex task.</p>

<p>Below we show how we do 3.3 million random triple lookups per second on two 8 core commodity servers producing complete results, joining across partitions. On a single 4 core server, the figure is about 1 million lookups per second.  With a single thread, it is about 250K lookups per second.  This is the good case.  But even our worst case is quite decent.</p>

<p>We took a simple <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x14fef850">SPARQL</a> query, counting how many people say they reciprocally know each other.  In the <a href="http://challenge.semanticweb.org/" id="link-id0x1bca04d0">Billion Triples Challenge</a> <a href="http://dbpedia.org/resource/Data" id="link-id0x1be84e88">data</a> set, there are 25M <code>foaf:knows</code> quads of which 92K are reciprocal. <i>Reciprocal</i> here means that when x knows y in some graph, y knows x in the same or any other graph.</p>

<pre>SELECT COUNT (*) 
WHERE { 
         ?p1  foaf:knows  ?p2  . 
         ?p2  foaf:knows  ?p1 
      }</pre>

<p>There is no guarantee that the triple of <code>x knows y</code> is in the same partition as the triple <code>y knows x</code>.  Thus the join is randomly distributed, n partitions to n partitions.</p>

<p>We left this out of the Billion Triples Challenge demo because this did not run fast enough for our liking.  Since then, we have corrected this.</p>

<p>If run on a single thread, this query would be a loop over all the quads with a predicate of <code>foaf:knows</code>, and an inner loop looking for a quad with 3 of 4 fields given (<code>SPO</code>). If we have a partitioned situation, we have a loop over all the <code>foaf:knows</code> quads in each partition, and an inner lookup looking for the reciprocal <code>foaf:knows</code> quad in whatever partition it may be found.</p>

<p>We have implemented this with two different message patterns: </p>

<ol>
 <li>
  <p>
    <b>Centralized:</b> One process reads all the <code>foaf:knows</code> quads from all processes.  Every 50K quads, it sends a batch of reciprocal quad checks to each partition that could contain a reciprocal quad.  Each partition keeps the count of found reciprocal quads, and these are gathered and added up at the end.</p>
 </li>

<li>
  <p>
    <b>Symmetrical:</b> Each process reads the <code>foaf:knows</code> quads in its partition, and sends a batch of checks to each process that could have the reciprocal <code>foaf:knows</code> quad every 50K quads.  At the end, the counts are gathered from all partitions.  There is some additional control traffic but we do not go into its details here.</p>
</li>
</ol>

<p>Below is the result measured on 2 machines each with 2 x Xeon 5345 (quad core; total 8 cores), 16G RAM, and each machine running 6 <a href="http://virtuoso.openlinksw.com" id="link-id0x16642a90">Virtuoso</a> instances.  The interconnect is dual 1-Gbit ethernet. Numbers are with warm cache.</p>

<blockquote>
<code>Centralized:  35,543 msec,  728,634 sequential + random lookups per second <br />
Cluster 12 nodes, 35 s. 1072 m/s 39,085 KB/s  316% cpu ...
 <br /> <br />
Symmetrical:  7706 msec, 3,360,740 sequential + random lookups per second  <br />
Cluster 12 nodes, 7 s. 572 m/s 16,983 KB/s  1137% cpu ...</code>
</blockquote>

<p>The second line is the summary from the cluster status report for the duration of the query.  The interesting numbers are the KB/s and the %CPU.  The former is the cross-sectional data transfer rate for intra-cluster communication; the latter is the consolidated CPU utilization, where a constantly-busy core counts for 100%.  The point to note is that the symmetrical approach takes 4x less real time with under half the data transfer rate.  Further, when using multiple machines, the speed of a single interface does not limit the overall throughput as it does in the centralized situation.</p>

<p>These figures represent the best and worst cases of distributed <code>JOIN</code>ing.  If we have a straight sequence of <code>JOIN</code>s, with single pattern optionals and existences and the order in which results are produced is not significant (i.e., there is aggregation, existence test, or <code>ORDER BY</code>), the symmetrical pattern is applicable.  On the other hand, if there are multiple triple pattern optionals, complex sub-queries, <code>DISTINCT</code>s in the middle of the query, or results have to be produced in the order of an index, then the centralized approach must be used at least part of the time.</p>

<p>Also, if we must make transitive closures, which can be thought of as an extension of a <code>DISTINCT</code> in a subquery, we must pass the data through a single point before moving the bindings to the next <code>JOIN</code> in the sequence. This happens for example in resolving <code><a href="http://dbpedia.org/resource/Web_Ontology_Language" id="link-id0x14e1a160">owl</a>:sameAs</code> at run time.  However, the good news is that performance does not fall much below the centralized figure even when there are complex nested structures with intermediate transitive closures, <code>DISTINCT</code>s, complex existence tests, etc., that require passing all intermediate results through a central point. No matter the complexity, it is always possible to vector some tens-of-thousands of variable bindings into a single message exchange.  And if there are not that many intermediate results, then single query execution time is not a problem anyhow.</p>

<p>For our sample query, we would get still more speed by using a partitioned hash join, filling the hash from the <code>foaf:knows</code> relations and then running the <code>foaf:knows</code> relations through the hash.  If the hash size is right, a hash lookup is somewhat better than an index lookup.  The problem is that when the hash join is not the right solution, it is an expensive mistake:  the best case is good; the worst case is very bad. But if there is no index then hash join is better than nothing.  One problem of hash joins is that they make temporary data structures which, if large, will skew the working set.  One must be quite sure of the cardinality before it is safe to try a hash join.  So we do not do hash joins with RDF, but we do use them sometimes with relational data. </p>

<p>These same methods apply to relational data just as well.  This does not make generic RDF storage outperform an application-specific relational representation on the same platform, as the latter benefits from all the same optimizations, but in terms of sheer numbers, this makes RDF representation an option where it was not an option before. RDF is all about not needing to design the schema around the queries, and not needing to limit what joins with what else.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-11-27#1487">
  <rss:title>An Example of RDF Scalability</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-11-27T11:23:47Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">We hear it to exhaustion, where is RDF scalability? We have been suggesting for a while that this is a solved question. I will here give some concrete numbers to back this. The scalability dream is to add hardware and get increased performance in proportion to the power the added component has when measured by itself. A corollary dream is to take scalability effects that are measured in a simple task and see them in a complex task. Below we show how we do 3.3 million random triple lookups per second on two 8 core commodity servers producing complete results, joining across partitions. On a single 4 core server, the figure is about 1 million lookups per second. With a single thread, it is about 250K lookups per second. This is the good case. But even our worst case is quite decent. We took a simple SPARQL query, counting how many people say they reciprocally know each other. In the Billion Triples Challenge data set, there are 25M foaf:knows quads of which 92K are reciprocal. Reciprocal here means that when x knows y in some graph, y knows x in the same or any other graph. SELECT COUNT (*) WHERE { ?p1 foaf:knows ?p2 . ?p2 foaf:knows ?p1 } There is no guarantee that the triple of x knows y is in the same partition as the triple y knows x. Thus the join is randomly distributed, n partitions to n partitions. We left this out of the Billion Triples Challenge demo because this did not run fast enough for our liking. Since then, we have corrected this. If run on a single thread, this query would be a loop over all the quads with a predicate of foaf:knows, and an inner loop looking for a quad with 3 of 4 fields given (SPO). If we have a partitioned situation, we have a loop over all the foaf:knows quads in each partition, and an inner lookup looking for the reciprocal foaf:knows quad in whatever partition it may be found. 
We have implemented this with two different message patterns: Centralized: One process reads all the foaf:knows quads from all processes. Every 50K quads, it sends a batch of reciprocal quad checks to each partition that could contain a reciprocal quad. Each partition keeps the count of found reciprocal quads, and these are gathered and added up at the end. Symmetrical: Each process reads the foaf:knows quads in its partition, and sends a batch of checks to each process that could have the reciprocal foaf:knows quad every 50K quads. At the end, the counts are gathered from all partitions. There is some additional control traffic but we do not go into its details here. Below is the result measured on 2 machines each with 2 x Xeon 5345 (quad core; total 8 cores), 16G RAM, and each machine running 6 Virtuoso instances. The interconnect is dual 1-Gbit ethernet. Numbers are with warm cache. Centralized: 35,543 msec, 728,634 sequential + random lookups per second Cluster 12 nodes, 35 s. 1072 m/s 39,085 KB/s 316% cpu ... Symmetrical: 7706 msec, 3,360,740 sequential + random lookups per second Cluster 12 nodes, 7 s. 572 m/s 16,983 KB/s 1137% cpu ... The second line is the summary from the cluster status report for the duration of the query. The interesting numbers are the KB/s and the %CPU. The former is the cross-sectional data transfer rate for intra-cluster communication; the latter is the consolidated CPU utilization, where a constantly-busy core counts for 100%. The point to note is that the symmetrical approach takes 4x less real time with under half the data transfer rate. Further, when using multiple machines, the speed of a single interface does not limit the overall throughput as it does in the centralized situation. These figures represent the best and worst cases of distributed JOINing. 
If we have a straight sequence of JOINs, with single pattern optionals and existences and the order in which results are produced is not significant (i.e., there is aggregation, existence test, or ORDER BY), the symmetrical pattern is applicable. On the other hand, if there are multiple triple pattern optionals, complex sub-queries, DISTINCTs in the middle of the query, or results have to be produced in the order of an index, then the centralized approach must be used at least part of the time. Also, if we must make transitive closures, which can be thought of as an extension of a DISTINCT in a subquery, we must pass the data through a single point before moving the bindings to the next JOIN in the sequence. This happens for example in resolving owl:sameAs at run time. However, the good news is that performance does not fall much below the centralized figure even when there are complex nested structures with intermediate transitive closures, DISTINCTs, complex existence tests, etc., that require passing all intermediate results through a central point. No matter the complexity, it is always possible to vector some tens-of-thousands of variable bindings into a single message exchange. And if there are not that many intermediate results, then single query execution time is not a problem anyhow. For our sample query, we would get still more speed by using a partitioned hash join, filling the hash from the foaf:knows relations and then running the foaf:knows relations through the hash. If the hash size is right, a hash lookup is somewhat better than an index lookup. The problem is that when the hash join is not the right solution, it is an expensive mistake: the best case is good; the worst case is very bad. But if there is no index then hash join is better than nothing. One problem of hash joins is that they make temporary data structures which, if large, will skew the working set. One must be quite sure of the cardinality before it is safe to try a hash join. 
So we do not do hash joins with RDF, but we do use them sometimes with relational data. These same methods apply to relational data just as well. This does not make generic RDF storage outperform an application-specific relational representation on the same platform, as the latter benefits from all the same optimizations, but in terms of sheer numbers, this makes RDF representation an option where it was not an option before. RDF is all about not needing to design the schema around the queries, and not needing to limit what joins with what else.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>We hear it to exhaustion, where is <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x1eab4128">RDF</a> scalability?  We have been suggesting for a while that this is a solved question.  I will here give some concrete numbers to back this.</p>

<p>The scalability dream is to add hardware and get increased performance in proportion to the power the added component has when measured by itself. A corollary dream is to take scalability effects that are measured in a simple task and see them in a complex task.</p>

<p>Below we show how we do 3.3 million random triple lookups per second on two 8 core commodity servers producing complete results, joining across partitions. On a single 4 core server, the figure is about 1 million lookups per second.  With a single thread, it is about 250K lookups per second.  This is the good case.  But even our worst case is quite decent.</p>

<p>We took a simple <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x15cb3da8">SPARQL</a> query, counting how many people say they reciprocally know each other.  In the <a href="http://challenge.semanticweb.org/" id="link-id0x1bfb7a00">Billion Triples Challenge</a> <a href="http://dbpedia.org/resource/Data" id="link-id0xa57187d8">data</a> set, there are 25M <code>foaf:knows</code> quads of which 92K are reciprocal. <i>Reciprocal</i> here means that when x knows y in some graph, y knows x in the same or any other graph.</p>

<pre>SELECT COUNT (*) 
WHERE { 
         ?p1  foaf:knows  ?p2  . 
         ?p2  foaf:knows  ?p1 
      }</pre>

<p>There is no guarantee that the triple of <code>x knows y</code> is in the same partition as the triple <code>y knows x</code>.  Thus the join is randomly distributed, n partitions to n partitions.</p>

<p>We left this out of the Billion Triples Challenge demo because this did not run fast enough for our liking.  Since then, we have corrected this.</p>

<p>If run on a single thread, this query would be a loop over all the quads with a predicate of <code>foaf:knows</code>, and an inner loop looking for a quad with 3 of 4 fields given (<code>SPO</code>). If we have a partitioned situation, we have a loop over all the <code>foaf:knows</code> quads in each partition, and an inner lookup looking for the reciprocal <code>foaf:knows</code> quad in whatever partition it may be found.</p>

<p>We have implemented this with two different message patterns: </p>

<ol>
 <li>
  <p>
    <b>Centralized:</b> One process reads all the <code>foaf:knows</code> quads from all processes.  Every 50K quads, it sends a batch of reciprocal quad checks to each partition that could contain a reciprocal quad.  Each partition keeps the count of found reciprocal quads, and these are gathered and added up at the end.</p>
 </li>

<li>
  <p>
    <b>Symmetrical:</b> Each process reads the <code>foaf:knows</code> quads in its partition, and sends a batch of checks to each process that could have the reciprocal <code>foaf:knows</code> quad every 50K quads.  At the end, the counts are gathered from all partitions.  There is some additional control traffic but we do not go into its details here.</p>
</li>
</ol>

<p>Below is the result measured on 2 machines each with 2 x Xeon 5345 (quad core; total 8 cores), 16G RAM, and each machine running 6 <a href="http://virtuoso.openlinksw.com" id="link-id0x1c0c94a8">Virtuoso</a> instances.  The interconnect is dual 1-Gbit ethernet. Numbers are with warm cache.</p>

<blockquote>
<code>Centralized:  35,543 msec,  728,634 sequential + random lookups per second <br />
Cluster 12 nodes, 35 s. 1072 m/s 39,085 KB/s  316% cpu ...
 <br /> <br />
Symmetrical:  7706 msec, 3,360,740 sequential + random lookups per second  <br />
Cluster 12 nodes, 7 s. 572 m/s 16,983 KB/s  1137% cpu ...</code>
</blockquote>

<p>The second line is the summary from the cluster status report for the duration of the query.  The interesting numbers are the KB/s and the %CPU.  The former is the cross-sectional data transfer rate for intra-cluster communication; the latter is the consolidated CPU utilization, where a constantly-busy core counts for 100%.  The point to note is that the symmetrical approach takes 4x less real time with under half the data transfer rate.  Further, when using multiple machines, the speed of a single interface does not limit the overall throughput as it does in the centralized situation.</p>

<p>These figures represent the best and worst cases of distributed <code>JOIN</code>ing.  If we have a straight sequence of <code>JOIN</code>s, with single pattern optionals and existences and the order in which results are produced is not significant (i.e., there is aggregation, existence test, or <code>ORDER BY</code>), the symmetrical pattern is applicable.  On the other hand, if there are multiple triple pattern optionals, complex sub-queries, <code>DISTINCT</code>s in the middle of the query, or results have to be produced in the order of an index, then the centralized approach must be used at least part of the time.</p>

<p>Also, if we must make transitive closures, which can be thought of as an extension of a <code>DISTINCT</code> in a subquery, we must pass the data through a single point before moving the bindings to the next <code>JOIN</code> in the sequence. This happens for example in resolving <code><a href="http://dbpedia.org/resource/Web_Ontology_Language" id="link-id0x28005280">owl</a>:sameAs</code> at run time.  However, the good news is that performance does not fall much below the centralized figure even when there are complex nested structures with intermediate transitive closures, <code>DISTINCT</code>s, complex existence tests, etc., that require passing all intermediate results through a central point. No matter the complexity, it is always possible to vector some tens-of-thousands of variable bindings into a single message exchange.  And if there are not that many intermediate results, then single query execution time is not a problem anyhow.</p>

<p>For our sample query, we would get still more speed by using a partitioned hash join, filling the hash from the <code>foaf:knows</code> relations and then running the <code>foaf:knows</code> relations through the hash.  If the hash size is right, a hash lookup is somewhat better than an index lookup.  The problem is that when the hash join is not the right solution, it is an expensive mistake:  the best case is good; the worst case is very bad. But if there is no index then hash join is better than nothing.  One problem of hash joins is that they make temporary data structures which, if large, will skew the working set.  One must be quite sure of the cardinality before it is safe to try a hash join.  So we do not do hash joins with RDF, but we do use them sometimes with relational data. </p>

<p>These same methods apply to relational data just as well.  This does not make generic RDF storage outperform an application-specific relational representation on the same platform, as the latter benefits from all the same optimizations, but in terms of sheer numbers, this makes RDF representation an option where it was not an option before. RDF is all about not needing to design the schema around the queries, and not needing to limit what joins with what else.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-10-24#1460">
  <rss:title>State of the Semantic Web, Part 1 - Sociology, Business, and Messaging (update 2)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-10-24T10:19:03Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I was in Vienna for the Linked Data Practitioners gathering this week. Danny Ayers asked me if I would blog about the State of the Semantic Web or write the This Week&#39;s Semantic Web column. I don&#39;t have the time to cover all that may have happened during the past week but I will editorialize about the questions that again were raised in Vienna. How these things relate to Virtuoso will be covered separately. This is about the overarching questions of the times, not the finer points of geek craft. Sören Auer asked me to say a few things about relational to RDF mapping. I will cite some highlights from this, as they pertain to the general scene. There was an &quot;open hacking&quot; session Wednesday night featuring lightning talks. I will use some of these too as a starting point. The messaging? The SWEO (Semantic Web Education and Outreach) interest group of the W3C spent some time looking for an elevator pitch for the Semantic Web. It became &quot;Data Unleashed.&quot; Why not? Let&#39;s give this some context. So, if we are holding a Semantic Web 101 session, where should we begin? I hazard to guess that we should not begin by writing a FOAF file in Turtle by hand, as this is one thing that is not likely to happen in the real world. Of course, the social aspect of the Data Web is the most immediately engaging, so a demo might be to go make an account with myopenlink.net and see that after one has entered the data one normally enters for any social network, one has become a Data Web citizen. This means that one can be found, just like this, with a query against the set of data spaces hosted on the system. Then we just need a few pages that repurpose this data and relate it to other data. We show some samples of queries like this in our Billion Triples Challenge demo. We will make a webcast about this to make it all clearer. 
Behold: The Data Web is about the world becoming a database; writing SPARQL queries or triples is incidental. You will write FOAF files by hand just as little as you now write SQL insert statements for filling in your account information on Myspace. Every time there is a major shift in technology, this shift needs to be motivated by addressing a new class of problem. This means doing something that could not be done before. The last time this happened was when the relational database became the dominant IT technology. At that time, the questions involved putting the enterprise in the database and building a cluster of Line Of Business (LOB) applications around the database. The argument for the RDBMS was that you did not have to constrain the set of queries that might later be made, when designing the database. In other words, it was making things more ad hoc. This was opposed then on grounds of being less efficient than the hierarchical and network databases which the relational eventually replaced. Today, the point of the Data Web is that you do not have to constrain what your data can join or integrate with, when you design your database. The counter-argument is that this is slow and geeky and not scalable. See the similarity? A difference is that we are not specifically aiming at replacing the RDBMS. In fact, if you know exactly what you will query and have a well defined workload, a relational representation optimized for the workload will give you about 10x the performance of the equivalent RDF warehouse. OLTP remains a relational-only domain. However, when we are talking about doing queries and analytics against the Web, or even against more than a handful of relational systems, the things which make RDBMS good become problematic. What is the business value of this? The most reliable of human drives is the drive to make oneself known. This drives all, from any social scene to business communications to politics. 
Today, when you want to proclaim you exist, you do so first on the Web. The Web did not become the prevalent media because business loved it for its own sake, it became prevalent because business could not afford not to assert their presence there. If anything, the Web eroded the communications dominance of a lot of players, which was not welcome but still had to be dealt with, by embracing the Web. Today, in a world driven by data, the Data Web will be catalyzed by similar factors: If your data is not there, you will not figure in query results. Search engines will play some role there but also many social applications will have reports that are driven by published data. Also consider any e-commerce, any marketplace, and so forth. The Data Portability movement is a case in point: Users want to own their own content; silo operators want to capitalize on holding it. Right now, we see these things in silos; the Data Web will create bridges between these, and what is now in silo data centers will be increasingly available on an ad hoc basis with Open Data. Again, we see a movement from the specialized to the generic: What LinkedIn does in its data center can be done with ad hoc queries with linked open data. Of course, LinkedIn does these things somewhat more efficiently because their system is built just for this task, but the linked data approach has the built-in readiness to join with everything else at almost no cost, without making a new data warehouse for each new business question. We could call this the sociological aspect of the thing. Getting to more concrete business, we see an economy that, we could say, without being alarmists, is confronted with some issues. Well, generally when times are bad, this results in consolidation of property and power. Businesses fail and get split up and sold off in pieces, government adds controls and regulations and so forth. This means ad hoc data integration, as control without data is just pretense. 
If times are lean, this also means that there is little readiness to do wholesale replacement of systems, which will take years before producing anything. So we must play with what there is and make it deliver, in ways and conditions that were not necessarily anticipated. The agility of the Data Web, if correctly understood, can be of great benefit there, especially on the reporting and business intelligence side. Specifically mapping line-of-business systems into RDF on the fly will help with integration, making the specialized warehouse the slower and more expensive alternative. But this too is needed at times. But for the RDF community to be taken seriously there, the messaging must be geared in this direction. Writing FOAF files by hand is not where you begin the pitch. Well, what is more natural than having a global, queriable information space, when you have a global information driven economy? The Data Web is about making this happen. First with doing this in published generally available data; next with the enterprises having their private data for their own use but still linking toward the outside, even though private data stays private: You can still use standard terms and taxonomies, where they apply, when talking of proprietary information. But let&#39;s get back to more specific issues. At the lightning talks in Vienna, one participant said, &quot;Man&#39;s enemy is not the lion that eats men, it&#39;s his own brother. Semantic Web&#39;s enemy is the XML Web services stack that ate its lunch.&quot; There is some truth to the first part. The second part deserves some comment. The Web services stack is about transactions. When you have a fixed, often repeating task, it is a natural thing to make this a Web service. Even though SOA is not really prevalent in enterprise IT, it has value in things like managing supply-chain logistics with partners, etc. Lots of standard messages with unambiguous meaning. 
To make a parallel with the database world: first there was OLTP; then there was business intelligence. Of course, you must first have the transactions, to have something to analyze. SOA is for the transactions; the Data Web is for integration, analysis, and discovery. It is the ad hoc component of the real time enterprise, if you will. It is not a competitor against a transaction oriented SOA. In fact, RDF has no special genius for transactions. Another mistake that often gets made is stretching things beyond their natural niche. Doing transactions in RDF is this sort of over-stretching without real benefit. &quot;I made an ontology and it really did solve a problem. How do I convince the enterprise people, the MBA who says it&#39;s too complex, the developer who says it is not what he&#39;s used to, and so on?&quot; This is an education question. One of the findings of SWEO&#39;s enterprise survey was that there was awareness that difficult problems existed. There were and are corporate ontologies and taxonomies, diversely implemented. Some of these needs are recognized. RDF based technologies offer to make these more open standards based. Open standards have proven economical in the past. What we also hear is that major enterprises do not even know what their information and human resources assets are: Experts can&#39;t be found even when they are in the next department, or reports and analysis get buried in wikis, spreadsheets, and emails. Just as when SQL took off, we need vendors to do workshops on getting started with a technology. The affair in Vienna was a step in this direction. Another type of event specially focusing on vertical problems and their Data Web solutions is a next step. For example, one could do a workshop on integrating supply chain information with Data Web technologies. Or one on making enterprise knowledge bases from HR, CRM, office automation, wikis, etc. 
The good thing is that all these things are additions to, not replacements of, the existing mission-critical infrastructure. And better use of what you already have ought to be the theme of the day.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>I was in <a href="http://dbpedia.org/resource/Vienna" id="link-id0x1f18a540">Vienna</a> for the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x1ec788a0">Linked Data</a> Practitioners gathering this week. Danny Ayers asked me if I would <a href="http://dbpedia.org/resource/Blog" id="link-id0x20838238">blog</a> about the State of the <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0x20694ed8">Semantic Web</a> or write the <i>This Week&#39;s Semantic Web</i> column. I don&#39;t have the time to cover all that may have happened during the past week but I will editorialize about the questions that again were raised in Vienna. How these things relate to <a href="http://virtuoso.openlinksw.com" id="link-id0x20b1cd38">Virtuoso</a> will be covered separately. This is about the overarching questions of the times, not the finer points of geek craft.</p>
<p>
<a href="http://www.informatik.uni-leipzig.de/~auer/foaf.rdf#me" id="link-id0x1ff31b30">Sören Auer</a> asked me to say a few things about relational to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x1f8118e0">RDF</a> mapping. I will cite some highlights from this, as they pertain to the general scene. There was an &quot;open hacking&quot; session Wednesday night featuring lightning talks. I will use some of these too as a starting point.</p>
<h3>The messaging?</h3>
<p>The <a href="http://www.w3.org/2001/sw/sweo/" id="link-id0x1dc39210">SWEO</a> (Semantic Web Education and Outreach) interest group of the W3C spent some time looking for an elevator pitch for the Semantic Web. It became &quot;<a href="http://dbpedia.org/resource/Data" id="link-id0x1f24dd98">Data</a> Unleashed.&quot; Why not? Let&#39;s give this some context.</p>
<p>So, if we are holding a <i>Semantic Web 101</i> session, where should we begin? I hazard to guess that we should not begin by writing a FOAF file in Turtle by hand, as this is one thing that is not likely to happen in the real world.</p>
<p>Of course, the social aspect of the Data Web is the most immediately engaging, so a demo might be to go make an account with <a href="http://myopenlink.net/" id="link-id0x1f5e0198">myopenlink</a>.<a href="http://dbpedia.org/resource/.NET_Framework" id="link-id0x1ec49a00">net</a> and see that after one has entered the data one normally enters for any social network, one has become a Data Web citizen. This means that one can be found, just like this, with a query against the set of data spaces hosted on the system. Then we just need a few pages that repurpose this data and relate it to other data. We show some samples of queries like this in our <a href="http://challenge.semanticweb.org/" id="link-id0x1ee35f70">Billion Triples Challenge</a> demo. We will make a webcast about this to make it all clearer.</p>
<p>Behold: The Data Web is about the world becoming a database; writing <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x20644808">SPARQL</a> queries or triples is incidental. You will write FOAF files by hand just as little as you now write <a href="http://dbpedia.org/resource/SQL" id="link-id0x1fd9fbc0">SQL</a> insert statements for filling in your account <a href="http://dbpedia.org/resource/Information" id="link-id0x1dfd3540">information</a> on Myspace.</p>
<p>Every time there is a major shift in technology, this shift needs to be motivated by addressing a new class of problem. This means doing something that could not be done before. The last time this happened was when the relational database became the dominant IT technology. At that time, the questions involved putting the enterprise in the database and building a cluster of Line Of Business (LOB) applications around the database. The argument for the <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id0x1e920868">RDBMS</a> was that you did not have to constrain the set of queries that might later be made, when designing the database. In other words, it was making things more <i>ad hoc</i>. This was opposed then on grounds of being less efficient than the hierarchical and network databases which the relational eventually replaced.</p>
<p>Today, the point of the Data Web is that you do not have to constrain what your data can join or integrate with, when you design your database. The counter-argument is that this is slow and geeky and not scalable. See the similarity?</p>
<p>A difference is that we are not specifically aiming at replacing the RDBMS. In fact, if you know exactly what you will query and have a well defined workload, a relational representation optimized for the workload will give you about 10x the performance of the equivalent RDF warehouse. OLTP remains a relational-only domain.</p>
<p>However, when we are talking about doing queries and analytics against the Web, or even against more than a handful of relational systems, the things which make RDBMS good become problematic.</p>
<h3>What is the business value of this?</h3>
<p>The most reliable of human drives is the drive to make oneself known. This drives all, from any social scene to business communications to politics. Today, when you want to proclaim you exist, you do so first on the Web. The Web did not become the prevalent media because business loved it for its own sake, it became prevalent because business could not afford not to assert their presence there. If anything, the Web eroded the communications dominance of a lot of players, which was not welcome but still had to be dealt with, by embracing the Web.</p>
<p>Today, in a world driven by data, the Data Web will be catalyzed by similar factors: If your data is not there, you will not figure in query results. Search engines will play some role there but also many social applications will have reports that are driven by published data. Also consider any e-commerce, any marketplace, and so forth. The Data Portability movement is a case in point: Users want to own their own content; silo operators want to capitalize on holding it. Right now, we see these things in silos; the Data Web will create bridges between these, and what is now in silo data centers will be increasingly available on an ad hoc basis with Open Data.</p>
<p>Again, we see a movement from the specialized to the generic: What LinkedIn does in its data center can be done with ad hoc queries with <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id0x1e715138">linked open data</a>. Of course, LinkedIn does these things somewhat more efficiently because their system is built just for this task, but the linked data approach has the built-in readiness to join with everything else at almost no cost, without making a new data warehouse for each new business question.</p>
<p>We could call this the sociological aspect of the thing. Getting to more concrete business, we see an economy that, we could say, without being alarmists, is confronted with some issues. Well, generally when times are bad, this results in consolidation of property and power. Businesses fail and get split up and sold off in pieces, government adds controls and regulations and so forth. This means ad hoc data integration, as control without data is just pretense. If times are lean, this also means that there is little readiness to do wholesale replacement of systems, which will take years before producing anything. So we must play with what there is and make it deliver, in ways and conditions that were not necessarily anticipated. The agility of the Data Web, if correctly understood, can be of great benefit there, especially on the reporting and business intelligence side. Specifically mapping line-of-business systems into RDF on the fly will help with integration, making the specialized warehouse the slower and more expensive alternative. But this too is needed at times.</p>
<p>But for the RDF community to be taken seriously there, the messaging must be geared in this direction. Writing FOAF files by hand is not where you begin the pitch. Well, what is more natural than having a global, queryable information space, when you have a global information driven economy?</p>
<p>The Data Web is about making this happen. First with doing this in published generally available data; next with the enterprises having their private data for their own use but still linking toward the outside, even though private data stays private: You can still use standard terms and taxonomies, where they apply, when talking of proprietary information.</p>
<h3>But let&#39;s get back to more specific issues</h3>
<p>At the lightning talks in Vienna, one participant said, &quot;Man&#39;s enemy is not the lion that eats men, it&#39;s his own brother. Semantic Web&#39;s enemy is the <a href="http://dbpedia.org/resource/XML" id="link-id0x1aeb61b8">XML</a> Web services stack that ate its lunch.&quot; There is some truth to the first part. The second part deserves some comment. The Web services stack is about transactions. When you have a fixed, often repeating task, it is a natural thing to make this a Web service. Even though SOA is not really prevalent in enterprise IT, it has value in things like managing supply-chain logistics with partners, etc. Lots of standard messages with unambiguous meaning. To make a parallel with the database world: first there was OLTP; then there was business intelligence. Of course, you must first have the transactions, to have something to analyze.</p>
<p>SOA is for the transactions; the Data Web is for integration, analysis, and discovery. It is the <i>ad hoc</i> component of the real time enterprise, if you will. It is not a competitor against a transaction oriented SOA. In fact, RDF has no special genius for transactions. Another mistake that often gets made is stretching things beyond their natural niche. Doing transactions in RDF is this sort of over-stretching without real benefit.</p>
<p>&quot;I made an ontology and it really did solve a problem. How do I convince the enterprise people, the MBA who says it&#39;s too complex, the developer who says it is not what he&#39;s used to, and so on?&quot;</p>
<p>This is an education question. One of the findings of SWEO&#39;s enterprise survey was that there was awareness that difficult problems existed. There were and are corporate ontologies and taxonomies, diversely implemented. Some of these needs are recognized. RDF based technologies offer to make these more open standards based. Open standards have proven economical in the past. What we also hear is that major enterprises do not even know what their information and human resources assets are: Experts can&#39;t be found even when they are in the next department, or reports and analysis get buried in wikis, spreadsheets, and emails.</p>
<p>Just as when SQL took off, we need vendors to do workshops on getting started with a technology. The affair in Vienna was a step in this direction. Another type of event specially focusing on vertical problems and their Data Web solutions is a next step. For example, one could do a workshop on integrating supply chain information with Data Web technologies. Or one on making enterprise <a href="http://dbpedia.org/resource/Knowledge" id="link-id0x1fbd3398">knowledge</a> bases from HR, CRM, office automation, wikis, etc. The good thing is that all these things are additions to, not replacements of, the existing mission-critical infrastructure. And better use of what you already have ought to be the theme of the day.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-10-24#1459">
  <rss:title>State of the Semantic Web, Part 1 - Sociology, Business, and Messaging (update 2)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-10-24T10:19:03Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I was in Vienna for the Linked Data Practitioners gathering this week. Danny Ayers asked me if I would blog about the State of the Semantic Web or write the This Week&#39;s Semantic Web column. I don&#39;t have the time to cover all that may have happened during the past week but I will editorialize about the questions that again were raised in Vienna. How these things relate to Virtuoso will be covered separately. This is about the overarching questions of the times, not the finer points of geek craft. Sören Auer asked me to say a few things about relational to RDF mapping. I will cite some highlights from this, as they pertain to the general scene. There was an &quot;open hacking&quot; session Wednesday night featuring lightning talks. I will use some of these too as a starting point. The messaging? The SWEO (Semantic Web Education and Outreach) interest group of the W3C spent some time looking for an elevator pitch for the Semantic Web. It became &quot;Data Unleashed.&quot; Why not? Let&#39;s give this some context. So, if we are holding a Semantic Web 101 session, where should we begin? I hazard to guess that we should not begin by writing a FOAF file in Turtle by hand, as this is one thing that is not likely to happen in the real world. Of course, the social aspect of the Data Web is the most immediately engaging, so a demo might be to go make an account with myopenlink.net and see that after one has entered the data one normally enters for any social network, one has become a Data Web citizen. This means that one can be found, just like this, with a query against the set of data spaces hosted on the system. Then we just need a few pages that repurpose this data and relate it to other data. We show some samples of queries like this in our Billion Triples Challenge demo. We will make a webcast about this to make it all clearer. 
Behold: The Data Web is about the world becoming a database; writing SPARQL queries or triples is incidental. You will write FOAF files by hand just as little as you now write SQL insert statements for filling in your account information on Myspace. Every time there is a major shift in technology, this shift needs to be motivated by addressing a new class of problem. This means doing something that could not be done before. The last time this happened was when the relational database became the dominant IT technology. At that time, the questions involved putting the enterprise in the database and building a cluster of Line Of Business (LOB) applications around the database. The argument for the RDBMS was that you did not have to constrain the set of queries that might later be made, when designing the database. In other words, it was making things more ad hoc. This was opposed then on grounds of being less efficient than the hierarchical and network databases which the relational eventually replaced. Today, the point of the Data Web is that you do not have to constrain what your data can join or integrate with, when you design your database. The counter-argument is that this is slow and geeky and not scalable. See the similarity? A difference is that we are not specifically aiming at replacing the RDBMS. In fact, if you know exactly what you will query and have a well defined workload, a relational representation optimized for the workload will give you about 10x the performance of the equivalent RDF warehouse. OLTP remains a relational-only domain. However, when we are talking about doing queries and analytics against the Web, or even against more than a handful of relational systems, the things which make RDBMS good become problematic. What is the business value of this? The most reliable of human drives is the drive to make oneself known. This drives all, from any social scene to business communications to politics. 
Today, when you want to proclaim you exist, you do so first on the Web. The Web did not become the prevalent media because business loved it for its own sake, it became prevalent because business could not afford not to assert their presence there. If anything, the Web eroded the communications dominance of a lot of players, which was not welcome but still had to be dealt with, by embracing the Web. Today, in a world driven by data, the Data Web will be catalyzed by similar factors: If your data is not there, you will not figure in query results. Search engines will play some role there but also many social applications will have reports that are driven by published data. Also consider any e-commerce, any marketplace, and so forth. The Data Portability movement is a case in point: Users want to own their own content; silo operators want to capitalize on holding it. Right now, we see these things in silos; the Data Web will create bridges between these, and what is now in silo data centers will be increasingly available on an ad hoc basis with Open Data. Again, we see a movement from the specialized to the generic: What LinkedIn does in its data center can be done with ad hoc queries with linked open data. Of course, LinkedIn does these things somewhat more efficiently because their system is built just for this task, but the linked data approach has the built-in readiness to join with everything else at almost no cost, without making a new data warehouse for each new business question. We could call this the sociological aspect of the thing. Getting to more concrete business, we see an economy that, we could say, without being alarmists, is confronted with some issues. Well, generally when times are bad, this results in consolidation of property and power. Businesses fail and get split up and sold off in pieces, government adds controls and regulations and so forth. This means ad hoc data integration, as control without data is just pretense. 
If times are lean, this also means that there is little readiness to do wholesale replacement of systems, which will take years before producing anything. So we must play with what there is and make it deliver, in ways and conditions that were not necessarily anticipated. The agility of the Data Web, if correctly understood, can be of great benefit there, especially on the reporting and business intelligence side. Specifically mapping line-of-business systems into RDF on the fly will help with integration, making the specialized warehouse the slower and more expensive alternative. But this too is needed at times. But for the RDF community to be taken seriously there, the messaging must be geared in this direction. Writing FOAF files by hand is not where you begin the pitch. Well, what is more natural than having a global, queryable information space, when you have a global information driven economy? The Data Web is about making this happen. First with doing this in published generally available data; next with the enterprises having their private data for their own use but still linking toward the outside, even though private data stays private: You can still use standard terms and taxonomies, where they apply, when talking of proprietary information. But let&#39;s get back to more specific issues. At the lightning talks in Vienna, one participant said, &quot;Man&#39;s enemy is not the lion that eats men, it&#39;s his own brother. Semantic Web&#39;s enemy is the XML Web services stack that ate its lunch.&quot; There is some truth to the first part. The second part deserves some comment. The Web services stack is about transactions. When you have a fixed, often repeating task, it is a natural thing to make this a Web service. Even though SOA is not really prevalent in enterprise IT, it has value in things like managing supply-chain logistics with partners, etc. Lots of standard messages with unambiguous meaning. 
To make a parallel with the database world: first there was OLTP; then there was business intelligence. Of course, you must first have the transactions, to have something to analyze. SOA is for the transactions; the Data Web is for integration, analysis, and discovery. It is the ad hoc component of the real time enterprise, if you will. It is not a competitor against a transaction oriented SOA. In fact, RDF has no special genius for transactions. Another mistake that often gets made is stretching things beyond their natural niche. Doing transactions in RDF is this sort of over-stretching without real benefit. &quot;I made an ontology and it really did solve a problem. How do I convince the enterprise people, the MBA who says it&#39;s too complex, the developer who says it is not what he&#39;s used to, and so on?&quot; This is an education question. One of the findings of SWEO&#39;s enterprise survey was that there was awareness that difficult problems existed. There were and are corporate ontologies and taxonomies, diversely implemented. Some of these needs are recognized. RDF based technologies offer to make these more open standards based. Open standards have proven economical in the past. What we also hear is that major enterprises do not even know what their information and human resources assets are: Experts can&#39;t be found even when they are in the next department, or reports and analysis get buried in wikis, spreadsheets, and emails. Just as when SQL took off, we need vendors to do workshops on getting started with a technology. The affair in Vienna was a step in this direction. Another type of event specially focusing on vertical problems and their Data Web solutions is a next step. For example, one could do a workshop on integrating supply chain information with Data Web technologies. Or one on making enterprise knowledge bases from HR, CRM, office automation, wikis, etc. 
The good thing is that all these things are additions to, not replacements of, the existing mission-critical infrastructure. And better use of what you already have ought to be the theme of the day.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>I was in <a href="http://dbpedia.org/resource/Vienna" id="link-id0x28471870">Vienna</a> for the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x26f0ec28">Linked Data</a> Practitioners gathering this week. Danny Ayers asked me if I would <a href="http://dbpedia.org/resource/Blog" id="link-id0x26cf7678">blog</a> about the State of the <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0x273087e0">Semantic Web</a> or write the <i>This Week&#39;s Semantic Web</i> column. I don&#39;t have the time to cover all that may have happened during the past week but I will editorialize about the questions that again were raised in Vienna. How these things relate to <a href="http://virtuoso.openlinksw.com" id="link-id0x264e11b8">Virtuoso</a> will be covered separately. This is about the overarching questions of the times, not the finer points of geek craft.</p>
<p>
<a href="http://www.informatik.uni-leipzig.de/~auer/foaf.rdf#me" id="link-id0x2787de70">Sören Auer</a> asked me to say a few things about relational to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x280b12f8">RDF</a> mapping. I will cite some highlights from this, as they pertain to the general scene. There was an &quot;open hacking&quot; session Wednesday night featuring lightning talks. I will use some of these too as a starting point.</p>
<h3>The messaging?</h3>
<p>The <a href="http://www.w3.org/2001/sw/sweo/" id="link-id0x28078030">SWEO</a> (Semantic Web Education and Outreach) interest group of the W3C spent some time looking for an elevator pitch for the Semantic Web. It became &quot;<a href="http://dbpedia.org/resource/Data" id="link-id0x290a48c0">Data</a> Unleashed.&quot; Why not? Let&#39;s give this some context.</p>
<p>So, if we are holding a <i>Semantic Web 101</i> session, where should we begin? I hazard to guess that we should not begin by writing a FOAF file in Turtle by hand, as this is one thing that is not likely to happen in the real world.</p>
<p>Of course, the social aspect of the Data Web is the most immediately engaging, so a demo might be to go make an account with <a href="http://myopenlink.net/" id="link-id0x272ed6d0">myopenlink</a>.<a href="http://dbpedia.org/resource/.NET_Framework" id="link-id0x277dbbd0">net</a> and see that after one has entered the data one normally enters for any social network, one has become a Data Web citizen. This means that one can be found, just like this, with a query against the set of data spaces hosted on the system. Then we just need a few pages that repurpose this data and relate it to other data. We show some samples of queries like this in our <a href="http://challenge.semanticweb.org/" id="link-id0x25fda5c8">Billion Triples Challenge</a> demo. We will make a webcast about this to make it all clearer.</p>
<p>Behold: The Data Web is about the world becoming a database; writing <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x278c3878">SPARQL</a> queries or triples is incidental. You will write FOAF files by hand just as little as you now write <a href="http://dbpedia.org/resource/SQL" id="link-id0x27e6be18">SQL</a> insert statements for filling in your account <a href="http://dbpedia.org/resource/Information" id="link-id0x2727a278">information</a> on Myspace.</p>
<p>Every time there is a major shift in technology, this shift needs to be motivated by addressing a new class of problem. This means doing something that could not be done before. The last time this happened was when the relational database became the dominant IT technology. At that time, the questions involved putting the enterprise in the database and building a cluster of Line Of Business (LOB) applications around the database. The argument for the <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id0x26020128">RDBMS</a> was that you did not have to constrain the set of queries that might later be made, when designing the database. In other words, it was making things more <i>ad hoc</i>. This was opposed then on grounds of being less efficient than the hierarchical and network databases which the relational eventually replaced.</p>
<p>Today, the point of the Data Web is that you do not have to constrain what your data can join or integrate with, when you design your database. The counter-argument is that this is slow and geeky and not scalable. See the similarity?</p>
<p>A difference is that we are not specifically aiming at replacing the RDBMS. In fact, if you know exactly what you will query and have a well defined workload, a relational representation optimized for the workload will give you about 10x the performance of the equivalent RDF warehouse. OLTP remains a relational-only domain.</p>
<p>However, when we are talking about doing queries and analytics against the Web, or even against more than a handful of relational systems, the things which make RDBMS good become problematic.</p>
<h3>What is the business value of this?</h3>
<p>The most reliable of human drives is the drive to make oneself known. This drives all, from any social scene to business communications to politics. Today, when you want to proclaim you exist, you do so first on the Web. The Web did not become the prevalent media because business loved it for its own sake, it became prevalent because business could not afford not to assert their presence there. If anything, the Web eroded the communications dominance of a lot of players, which was not welcome but still had to be dealt with, by embracing the Web.</p>
<p>Today, in a world driven by data, the Data Web will be catalyzed by similar factors: If your data is not there, you will not figure in query results. Search engines will play some role there but also many social applications will have reports that are driven by published data. Also consider any e-commerce, any marketplace, and so forth. The Data Portability movement is a case in point: Users want to own their own content; silo operators want to capitalize on holding it. Right now, we see these things in silos; the Data Web will create bridges between these, and what is now in silo data centers will be increasingly available on an ad hoc basis with Open Data.</p>
<p>Again, we see a movement from the specialized to the generic: What LinkedIn does in its data center can be done with ad hoc queries with <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id0x261c7bc8">linked open data</a>. Of course, LinkedIn does these things somewhat more efficiently because their system is built just for this task, but the linked data approach has the built-in readiness to join with everything else at almost no cost, without making a new data warehouse for each new business question.</p>
<p>We could call this the sociological aspect of the thing. Getting to more concrete business, we see an economy that, we could say, without being alarmists, is confronted with some issues. Well, generally when times are bad, this results in consolidation of property and power. Businesses fail and get split up and sold off in pieces, government adds controls and regulations and so forth. This means ad hoc data integration, as control without data is just pretense. If times are lean, this also means that there is little readiness to do wholesale replacement of systems, which will take years before producing anything. So we must play with what there is and make it deliver, in ways and conditions that were not necessarily anticipated. The agility of the Data Web, if correctly understood, can be of great benefit there, especially on the reporting and business intelligence side. Specifically mapping line-of-business systems into RDF on the fly will help with integration, making the specialized warehouse the slower and more expensive alternative. But this too is needed at times.</p>
<p>But for the RDF community to be taken seriously there, the messaging must be geared in this direction. Writing FOAF files by hand is not where you begin the pitch. Well, what is more natural than having a global, queryable information space, when you have a global information-driven economy?</p>
<p>The Data Web is about making this happen. First with doing this in published generally available data; next with the enterprises having their private data for their own use but still linking toward the outside, even though private data stays private: You can still use standard terms and taxonomies, where they apply, when talking of proprietary information.</p>
<h3>But let&#39;s get back to more specific issues</h3>
<p>At the lightning talks in Vienna, one participant said, &quot;Man&#39;s enemy is not the lion that eats men, it&#39;s his own brother. Semantic Web&#39;s enemy is the <a href="http://dbpedia.org/resource/XML" id="link-id0x26273118">XML</a> Web services stack that ate its lunch.&quot; There is some truth to the first part. The second part deserves some comment. The Web services stack is about transactions. When you have a fixed, often repeating task, it is a natural thing to make this a Web service. Even though SOA is not really prevalent in enterprise IT, it has value in things like managing supply-chain logistics with partners, etc. Lots of standard messages with unambiguous meaning. To make a parallel with the database world: first there was OLTP; then there was business intelligence. Of course, you must first have the transactions, to have something to analyze.</p>
<p>SOA is for the transactions; the Data Web is for integration, analysis, and discovery. It is the <i>ad hoc</i> component of the real time enterprise, if you will. It is not a competitor against a transaction oriented SOA. In fact, RDF has no special genius for transactions. Another mistake that often gets made is stretching things beyond their natural niche. Doing transactions in RDF is this sort of over-stretching without real benefit.</p>
<p>&quot;I made an ontology and it really did solve a problem. How do I convince the enterprise people, the MBA who says it&#39;s too complex, the developer who says it is not what he&#39;s used to, and so on?&quot;</p>
<p>This is an education question. One of the findings of SWEO&#39;s enterprise survey was that there was awareness that difficult problems existed. There were and are corporate ontologies and taxonomies, diversely implemented. Some of these needs are recognized. RDF based technologies offer to make these more open-standards based. Open standards have proven economical in the past. What we also hear is that major enterprises do not even know what their information and human resources assets are: Experts can&#39;t be found even when they are in the next department, or reports and analyses get buried in wikis, spreadsheets, and emails.</p>
<p>Just as when SQL took off, we need vendors to do workshops on getting started with a technology. The affair in Vienna was a step in this direction. Another type of event specially focusing on vertical problems and their Data Web solutions is a next step. For example, one could do a workshop on integrating supply chain information with Data Web technologies. Or one on making enterprise <a href="http://dbpedia.org/resource/Knowledge" id="link-id0x260172a8">knowledge</a> bases from HR, CRM, office automation, wikis, etc. The good thing is that all these things are additions to, not replacements of, the existing mission-critical infrastructure. And better use of what you already have ought to be the theme of the day.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-10-01#1447">
  <rss:title>Where Are All the RDF-based Semantic Web Applications?</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-10-01T23:09:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">In response to the &quot;Semantic Web Technology&quot; application classification scheme espoused by ReadWriteWeb (RWW), emphasized in the post titled: Where are all the RDF-based Semantic Web Apps?, here is my attempt to clarify and reintroduce what OpenLink Software offers (today) in relation to Semantic Web technology. From the RWW Top-Down category, which I interpret as: technologies that produce RDF from non RDF data sources. Our product portfolio is comprised of the following; Virtuoso Universal Server, OpenLink Data Spaces, OpenLink Ajax Toolkit, and OpenLink Data Explorer (which includes ubiquity commands). Virtuoso Universal Server functionality summary: Generation of RDF Linked Data Views of SQL, XML, and Web Services in general Deployment of RDF Linked Data &quot;On the Fly&quot; generation of RDF Linked Data from Document Web information resources (i.e. distillation of entities from their containers e.g. Web pages) via Cartridges / Drivers SPARQL query language support SPARQL extensions that bring SPARQL closer to SQL e.g Aggregates, Update, Insert, Delete Named Graph support (i.e. use of logical names to partition RDF data within Virtuoso&#39;s multi-model dbms engine) Inference Engine (currently in use re. DBpedia via Yago and UMBEL) Host and exposes data from Drupal, Wordpress, MediaWiki, phpBB3 as RDF Linked Data via in-built support for PHP runtime Available as an EC2 AMI etc.. OpenLink Data Spaces functionality summary: Simple mechanism for Linked Data Web enabling yourself by giving you an HTTP based User ID (a de-referencable URI) that is linked to a FOAF based Profile page and OpenID Binds all your data sources (blogs, wikis, bookmarks, photos, calendar items etc. 
) to your URI so you can &quot;Find&quot; things by only remembering your URI Makes your profile page and personal URI the focal point of Linked Data Web presence Delivers Data Portability (using data access by value or data access by reference) across data silos (e.g. Web 2.0 style social networks) Allows you to make annotations about anything in your own Data Space(s) on the Web without exposure to RDF markup A Briefcase feature that provides a WebDAV driven RDF Linked Data variant of functionality seen in Mac OS X Spotlight and WinFS with the addition of SPARQL compliance Automatically generates RDFa in its (X)HTML pages Blog, Wiki, WebDAV File Server, Shared Bookmarks, Calendar, and other applications that look and feel like Web 2.0 counterparts but emit RDF Linked Data amongst a plethora of data exchange formats Available as an EC2 AMI etc.. OpenLink Ajax Toolkit functionality summary: Provides binding to SQL, RDF, XML, and Web Services via Ajax Database Connectivity Layer (you only need an ODBC, JDBC, OLE-DB, ADO.NET, XMLA Driver, or Web Service on the backend for dynamic data access from Javascript) All controls are Ajax Database Connectivity bound (widgets get their data from Ajax Database Connectivity data sources) Bundled with Virtuoso and ODS installations. etc. OpenLink Data Explorer functionality summary Distills entities associated with information resource style containers (e.g. Web Pages or files) as RDF Linked Data Exposes the RDF based Linked Data graph associated with information resources (see the Linked Data behind Web pages) Ubiquity commands for invoking the above Available as a Hosted Service or Firefox Extension Bundled with Virtuoso and ODS installations etc. 
Note: Of course you could have simply looked up OpenLink Software&#39;s FOAF based Profile page (*note the Linked Data Explorer tab*), or simply passed the FOAF profile page URL to a Linked Data aware client application such as: OpenLink Data Explorer, Zitgist Data Viewer, Marbles, and Tabulator, and obtained information. Remember, OpenLink Software is an Entity of Type: foaf:Organization, on the burgeoning Linked Data Web :-) Related Linked Data Planet Keynote (RDFa based remix edition) On The Cusp: A Global Review of the Semantic Web Industry.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
In response to the &quot;<a href="http://dbpedia.org/resource/Semantic_Web" id="link-id15971040">Semantic Web</a> Technology&quot; application classification scheme espoused by <a href="http://www.readwriteweb.com" id="link-id16391540">ReadWriteWeb</a> (RWW), emphasized in the post titled:  <a href="http://www.readwriteweb.com/archives/rdf_semantic_web_apps.php" id="link-id1157eaa0">Where are all the RDF-based Semantic Web Apps?</a>, here is my attempt to clarify and reintroduce what <a href="http://www.openlinksw.com/dataspace/organization/openlink#this" id="link-id15a43758">OpenLink Software</a> offers (today) in relation to Semantic Web technology.
</p>
<p>
From the RWW Top-Down category, which I interpret as: technologies that produce RDF from non-RDF <a href="http://dbpedia.org/resource/Data">data</a> sources. Our product portfolio comprises the following: <a href="http://virtuoso.openlinksw.com" id="link-id14f05818">Virtuoso Universal Server</a>, <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id162c8630">OpenLink Data Spaces</a>, <a href="http://oat.openlinksw.com" id="link-id134e1a00">OpenLink Ajax Toolkit</a>, and <a href="http://ode.openlinksw.com" id="link-id160b3bf8">OpenLink Data Explorer</a> (which includes ubiquity commands).</p>

<h3>Virtuoso Universal Server functionality summary:</h3>

<ol>
  <li>Generation of RDF <a href="http://dbpedia.org/resource/Linked_Data" id="link-id161d5f50">Linked Data</a> Views of <a href="http://dbpedia.org/resource/SQL" id="link-id161d5978">SQL</a>, XML, and <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> Services in general </li>
  <li>Deployment of RDF Linked Data </li>
  <li>&quot;On the Fly&quot; generation of RDF Linked Data from Document Web <a href="http://www.w3.org/TR/webarch/" id="link-id178bbc08">information resources</a> (i.e. distillation of entities from their containers e.g. Web pages) via Cartridges / Drivers</li>
  <li>
  <a href="http://dbpedia.org/resource/SPARQL" id="link-id162c2118">SPARQL</a> query language support </li>
  <li>SPARQL extensions that bring SPARQL closer to SQL, e.g. Aggregates, Update, Insert, Delete</li>
  <li>Named Graph support (i.e. use of logical names to partition RDF data within Virtuoso&#39;s multi-model dbms engine)</li>
  <li>Inference Engine (currently in use re. <a href="http://dbpedia.org/resource/DBpedia" id="link-id14f563c0">DBpedia</a> via Yago and <a href="http://umbel.org/about/" id="link-id113273b8">UMBEL</a>)</li>
  <li>Hosts and exposes data from <a href="http://dbpedia.org/resource/Drupal" id="link-id123d3bd8">Drupal</a>, <a href="http://dbpedia.org/resource/WordPress" id="link-id141adf40">WordPress</a>, <a href="http://dbpedia.org/resource/MediaWiki" id="link-id1604b450">MediaWiki</a>, <a href="http://dbpedia.org/resource/PhpBB" id="link-id141013a8">phpBB3</a> as RDF Linked Data via in-built support for <a href="http://dbpedia.org/resource/PHP" id="link-id14661e58">PHP</a> runtime</li>
  <li>
  <a href="http://virtuoso.openlinksw.com/dataspace/dav/wiki/Main/ODSInstallationEC2" id="link-id146c84d0">Available as an EC2 AMI</a>
</li>
  <li>etc..</li>
</ol>
<h3>OpenLink Data Spaces functionality summary:</h3>
<ol>
  <li>Simple mechanism for Linked Data <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id15473770">Web</a> enabling yourself by giving you an <a href="http://virtuoso.openlinksw.com/dataspace/dav/wiki/Main/GetAPersonalURIIn5MinutesOrLess" id="link-id15f6d278">HTTP based User ID</a> (a de-referencable <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id15aaeb68">URI</a>) that is linked to a <a href="http://myopenlink.net/dataspace/person/kidehen" id="link-id15a7a840">FOAF based Profile page</a> and OpenID</li>
  <li>Binds all your data sources (blogs, wikis, bookmarks, photos, calendar items etc.) to your URI so you can &quot;Find&quot; things by only remembering your URI</li>
  <li>Makes your profile page and personal URI the focal point of Linked Data Web presence</li>
  <li>Delivers Data Portability (using data access by value or <a href="http://dbpedia.org/resource/Reference_(computer_science)" id="link-id16212838">data access by reference</a>) across data silos (e.g. Web 2.0 style social networks)</li>
  <li>Allows you to make annotations about anything in your own <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id14668010">Data Space</a>(s) on the Web without exposure to RDF markup</li>
  <li>A Briefcase feature that provides a WebDAV driven RDF Linked Data variant of functionality seen in Mac OS X Spotlight and WinFS with the addition of SPARQL compliance</li>
  <li>Automatically generates <a href="http://dbpedia.org/resource/RDFa" id="link-id14691440">RDFa</a> in its (X)HTML pages</li>
  <li>
  <a href="http://dbpedia.org/resource/Blog" id="link-id14fae7b8">Blog</a>, Wiki, WebDAV File Server, Shared Bookmarks, Calendar, and other applications that look and feel like Web 2.0 counterparts but emit RDF Linked Data amongst a plethora of data exchange formats</li>
  <li>Available as an EC2 AMI</li>
  <li>etc..</li>
</ol>
<h3>OpenLink Ajax Toolkit functionality summary:</h3>
<ol>
  <li>Provides binding to SQL, RDF, XML, and Web Services via Ajax Database Connectivity Layer (you only need an <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id11550548">ODBC</a>, <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id13ae5f68">JDBC</a>, OLE-DB, <a href="http://dbpedia.org/resource/ADO.NET" id="link-id162803e8">ADO</a>.NET,  XMLA Driver, or Web Service on the backend for dynamic data access from Javascript)</li>
  <li>All controls are Ajax Database Connectivity bound (widgets get their data from Ajax Database Connectivity data sources)</li>
  <li>Bundled with Virtuoso and <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id161dfe90">ODS</a> installations.</li>
  <li>etc.</li>
</ol>
<h3>OpenLink Data Explorer functionality summary:</h3>
<ol>
  <li>Distills entities associated with information resource style containers (e.g. Web Pages or files) as RDF Linked Data</li>
  <li>Exposes the RDF based <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id12a42ed8">Linked Data graph</a> associated with information resources (see the Linked Data behind Web pages)</li>
  <li>Ubiquity commands for invoking the above</li>
  <li>Available as a <a href="http://linkeddata.uriburner.com/ode" id="link-id15a0d2b0">Hosted Service</a> or <a href="http://ode.openlinksw.com" id="link-id138b9fa8">Firefox Extension</a>
</li>
  <li>Bundled with Virtuoso and ODS installations</li>
  <li>etc.</li>
</ol>
<h3>Note:</h3>
<p>Of course you could have simply looked up <a href="http://www.openlinksw.com/dataspace/organization/openlink" id="link-id14ef2c10">OpenLink Software&#39;s FOAF based Profile page</a> (*note the Linked Data Explorer tab*), or simply passed the <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id14cbf5c8">FOAF</a> profile page <a href="http://dbpedia.org/resource/Uniform_Resource_Locator" id="link-id16453e28">URL</a> to a Linked Data aware client application such as: <a href="http://linkeddata.uriburner.com/ode" id="link-id15a80500">OpenLink Data Explorer</a>, <a href="http://zitgist.com/about/" id="link-id1586a360">Zitgist</a> <a href="http://dataviewer.zitgist.com" id="link-id16249f60">Data Viewer</a>, <a href="http://beckr.org/marbles" id="link-id15993fb0">Marbles</a>, and <a href="http://dig.csail.mit.edu/2005/ajar/release/tabulator/0.8/tab.html" id="link-id14d63048">Tabulator</a>, and obtained information. Remember, <a href="http://www.openlinksw.com/dataspace/organization/openlink#this" id="link-id138ba838">OpenLink Software</a> is an <a href="http://dbpedia.org/resource/Entity" id="link-id1173e120">Entity</a> of Type: <a href="http://xmlns.com/foaf/0.1/Organization" id="link-id138b87b8">foaf:Organization</a>, on the burgeoning Linked Data Web :-)</p>

<h3>Related</h3>
<ul>
<li>
  <a href="http://virtuoso.openlinksw.com/presentations/Creating_Deploying_Exploiting_Linked_Data2/Creating_Deploying_Exploiting_Linked_Data2_TimBL_v3.html" id="link-id163a0c88">Linked Data Planet Keynote</a> (RDFa based remix edition)</li>
<li>
  <a href="http://semanticbusiness.blogspot.com/2008/09/report-on-cusp-global-review-of.html" id="link-id11471a40">On The Cusp: A Global Review of the Semantic Web Industry.</a>
</li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-09-30#1446">
  <rss:title>OpenLink Software&#39;s Virtuoso Submission to the Billion Triples Challenge</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-09-30T16:24:34Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Introduction We use Virtuoso 6 Cluster Edition to demonstrate the following: Text and structured information based lookups Analytics queries Analysis of co-occurrence of features like interests and tags. Dealing with identity of multiple IRI&#39;s (owl:sameAs) The demo is based on a set of canned SPARQL queries that can be invoked using the OpenLink Data Explorer (ODE) Firefox extension. The demo queries can also be run directly against the SPARQL end point. The demo is being worked on at the time of submission and may be shown online by appointment. Automatic annotation of the data based on named entity extraction is being worked on at the time of this submission. By the time of ISWC 2008 the set of sample queries will be enhanced with queries based on extracted named entities and their relationships in the UMBEL and Open CYC ontologies. Also examples involving owl:sameAs are being added, likewise with similarity metrics and search hit scores. The Data The database consists of the billion triples data sets and some additions like Umbel. Also the Freebase extract is newer than the challenge original. The triple count is 1115 million. In the case of web harvested resources, the data is loaded in one graph per resource. In the case of larger data sets like Dbpedia or the US census, all triples of the provenance share a data set specific graph. All string literals are additionally indexed in a full text index. No stop words are used. Most queries do not specify a graph. Thus they are evaluated against the union of all the graphs in the database. The indexing scheme is SPOG, GPOS, POGS, OPGS. All indices ending in S are bitmap indices. The Queries The demo uses Virtuoso SPARQL extensions in most queries. These extensions consist on one hand of well known SQL features like aggregation with grouping and existence and value subqueries and on the other of RDF specific features. 
The latter include run time RDFS and OWL inferencing support and backward chaining subclasses and transitivity. Simple Lookups sparql select ?s ?p (bif:search_excerpt (bif:vector (&#39;semantic&#39;, &#39;web&#39;), ?o)) where { ?s ?p ?o . filter (bif:contains (?o, &quot;&#39;semantic web&#39;&quot;)) } limit 10 ; This looks up triples with semantic web in the object and makes a search hit summary of the literal, highlighting the search terms. sparql select ?tp count(*) where { ?s ?p2 ?o2 . ?o2 a ?tp . ?s foaf:nick ?o . filter (bif:contains (?o, &quot;plaid_skirt&quot;)) } group by ?tp order by desc 2 limit 40 ; This looks at what sorts of things are referenced by the properties of the foaf handle plaid_skirt. What are these things called? sparql select ?lbl count(*) where { ?s ?p2 ?o2 . ?o2 rdfs:label ?lbl . ?s foaf:nick ?o . filter (bif:contains (?o, &quot;plaid_skirt&quot;)) } group by ?lbl order by desc 2 ; Many of these things do not have an rdfs:label. Let us use a more general concept of label which groups dc:title, foaf:name and other name-like properties together. The subproperties are resolved at run time; there is no materialization. sparql define input:inference &#39;b3s&#39; select ?lbl count(*) where { ?s ?p2 ?o2 . ?o2 b3s:label ?lbl . ?s foaf:nick ?o . filter (bif:contains (?o, &quot;plaid_skirt&quot;)) } group by ?lbl order by desc 2 ; We can list sources by the topics they contain. Below we look for graphs that mention terrorist bombing. sparql select ?g count(*) where { graph ?g { ?s ?p ?o . filter (bif:contains (?o, &quot;&#39;terrorist bombing&#39;&quot;)) } } group by ?g order by desc 2 ; Now some web 2.0 tagging of search results. The tag cloud of &quot;computer&quot; sparql select ?lbl count (*) where { ?s ?p ?o . ?o bif:contains &quot;computer&quot; . ?s sioc:topic ?tg . optional { ?tg rdfs:label ?lbl } } group by ?lbl order by desc 2 limit 40 ; This query will find the posters who talk the most about sex. 
sparql select ?auth count (*) where { ?d dc:creator ?auth . ?d ?p ?o filter (bif:contains (?o, &quot;sex&quot;)) } group by ?auth order by desc 2 ; Analytics We look for people who are joined by having relatively uncommon interests but do not know each other. sparql select ?i ?cnt ?n1 ?n2 ?p1 ?p2 where { { select ?i count (*) as ?cnt where { ?p foaf:interest ?i } group by ?i } filter ( ?cnt &gt; 1 &amp;&amp; ?cnt &lt; 10) . ?p1 foaf:interest ?i . ?p2 foaf:interest ?i . filter (?p1 != ?p2 &amp;&amp; !bif:exists ((select (1) where {?p1 foaf:knows ?p2 })) &amp;&amp; !bif:exists ((select (1) where {?p2 foaf:knows ?p1 }))) . ?p1 foaf:nick ?n1 . ?p2 foaf:nick ?n2 . } order by ?cnt limit 50 ; The query takes a fairly long time, mostly spent counting the interested in 25M interest triples. It then takes people that share the interest and checks that neither claims to know the other. It then sorts the results rarest interest first. The query can be written more efficiently but is here just to show that database-wide scans of the population are possible ad hoc. Now we go to SQL to make a tag co-occurrence matrix. This can be used for showing a Technorati-style related tags line at the bottom of a search result page. This showcases the use of SQL together with SPARQL. The half-matrix of tags t1, t2 with the co-occurrence count at the intersection is much more efficiently done in SQL, especially since it gets updated as the data changes. This is an example of materialized intermediate results based on warehoused RDF. 
create table tag_count (tcn_tag iri_id_8, tcn_count int, primary key (tcn_tag)); alter index tag_count on tag_count partition (tcn_tag int (0hexffff00)); create table tag_coincidence (tc_t1 iri_id_8, tc_t2 iri_id_8, tc_count int, tc_t1_count int, tc_t2_count int, primary key (tc_t1, tc_t2)) alter index tag_coincidence on tag_coincidence partition (tc_t1 int (0hexffff00)); create index tc2 on tag_coincidence (tc_t2, tc_t1) partition (tc_t2 int (0hexffff00)); How many times each topic is mentioned? insert into tag_count select * from (sparql define output:valmode &quot;LONG&quot; select ?t count (*) as ?cnt where { ?s sioc:topic ?t } group by ?t) xx option (quietcast); Take all t1, t2 where t1 and t2 are tags of the same subject, store only the permutation where the internal id of t1 &lt; that of t2. insert into tag_coincidence (tc_t1, tc_t2, tc_count) select &quot;t1&quot;, &quot;t2&quot;, cnt from (select &quot;t1&quot;, &quot;t2&quot;, count (*) as cnt from (sparql define output:valmode &quot;LONG&quot; select ?t1 ?t2 where { ?s sioc:topic ?t1 . ?s sioc:topic ?t2 }) tags where &quot;t1&quot; &lt; &quot;t2&quot; group by &quot;t1&quot;, &quot;t2&quot;) xx where isiri_id (&quot;t1&quot;) and isiri_id (&quot;t2&quot;) option (quietcast); Now put the individual occurrence counts into the same table with the co-occurrence. This denormalization makes the related tags lookup faster. update tag_coincidence set tc_t1_count = (select tcn_count from tag_count where tcn_tag = tc_t1), tc_t2_count = (select tcn_count from tag_count where tcn_tag = tc_t2); Now each tag_coincidence row has the joint occurrence count and individual occurrence counts. A single select will return a Technorati-style related tags listing. 
To show the URI&#39;s of the tags: select top 10 id_to_iri (tc_T1), id_to_iri (tc_t2), tc_count from tag_coincidence order by tc_count desc; Social Networks We look at what interests people have sparql select ?o ?cnt where { { select ?o count (*) as ?cnt where { ?s foaf:interest ?o } group by ?o } filter (?cnt &gt; 100) } order by desc 2 limit 100 ; Now the same for the Harry Potter fans sparql select ?i2 count (*) where { ?p foaf:interest &lt;http://www.livejournal.com/interests.bml?int=harry+potter&gt; . ?p foaf:interest ?i2 } group by ?i2 order by desc 2 limit 20 ; We see whether knows relations are symmetrical. We return the top n people that others claim to know without being reciprocally known. sparql select ?celeb, count (*) where { ?claimant foaf:knows ?celeb . filter (!bif:exists ((select (1) where { ?celeb foaf:knows ?claimant }))) } group by ?celeb order by desc 2 limit 10 ; We look for a well connected person to start from. sparql select ?p count (*) where { ?p foaf:knows ?k } group by ?p order by desc 2 limit 50 ; We look for the most connected of the many online identities of Stefan Decker. sparql select ?sd count (distinct ?xx) where { ?sd a foaf:Person . ?sd ?name ?ns . filter (bif:contains (?ns, &quot;&#39;Stefan Decker&#39;&quot;)) . ?sd foaf:knows ?xx } group by ?sd order by desc 2 ; We count the transitive closure of Stefan Decker&#39;s connections sparql select count (*) where { { select * where { ?s foaf:knows ?o } } option (transitive, t_distinct, t_in(?s), t_out(?o)) . filter (?s = &lt;mailto:stefan.decker@deri.org&gt;) } ; Now we do the same while following owl:sameAs links. sparql define input:same-as &quot;yes&quot; select count (*) where { { select * where { ?s foaf:knows ?o } } option (transitive, t_distinct, t_in(?s), t_out(?o)) . filter (?s = &lt;mailto:stefan.decker@deri.org&gt;) } ; Demo System The system runs on Virtuoso 6 Cluster Edition. The database is partitioned into 12 partitions, each served by a distinct server process. 
The system demonstrated hosts these 12 servers on 2 machines, each with 2 x Xeon 5345 and 16GB memory and 4 SATA disks. For scaling, the processes and corresponding partitions can be spread over a larger number of machines. If each ran on its own server with 16GB RAM, the whole data set could be served from memory. This is desirable for search engine or fast analytics applications. Most of the demonstrated queries run in memory on second invocation. The timing difference between first and second run is easily an order of magnitude.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<div>
<h2>Introduction</h2> 

<p>We use <a href="http://virtuoso.openlinksw.com" id="link-id0xb03e418">Virtuoso</a> 6 Cluster Edition to demonstrate the following:</p>
<ul>
<li>Text and structured <a href="http://dbpedia.org/resource/Information" id="link-id0xbd9dae8">information</a> based lookups</li>
<li>Analytics queries</li>
<li>Analysis of co-occurrence of features like interests and tags.</li>
<li>Dealing with identity of multiple IRI&#39;s (<a href="http://dbpedia.org/resource/Web_Ontology_Language" id="link-id0xb383dd8">owl</a>:sameAs)</li>
</ul>

<p>The demo is based on a set of canned <a href="http://dbpedia.org/resource/SPARQL" id="link-id0xbda6298">SPARQL</a> queries that can be invoked using the <a href="http://ode.openlinksw.com/" id="link-id0xbb292f0">OpenLink Data Explorer</a> (<a href="http://ode.openlinksw.com/" id="link-id0xc263528">ODE</a>) Firefox extension.</p>
<p>The demo queries can also be run directly against the SPARQL end point.</p>

<p>The demo is being worked on at the time of submission and may be shown online by appointment.</p>

<p>Automatic annotation of the <a href="http://dbpedia.org/resource/Data" id="link-id0xa173378">data</a> based on <a href="http://dbpedia.org/resource/Named_entity_recognition" id="link-id0xbdda558">named entity extraction</a> is
being worked on at the time of this submission.  By the time of ISWC
2008 the set of sample queries will be enhanced with queries based on
extracted <a href="http://dbpedia.org/resource/Named_entity_recognition" id="link-id0xa66fbe0">named entities</a> and their relationships in the <a href="http://umbel.org/about/" id="link-id0xa06e2c8">UMBEL</a> and Open
CYC ontologies.
</p>

<p>Also examples involving owl:sameAs are being added, likewise  with similarity metrics and search hit scores.</p>

<h2>The Data</h2>

<p>The database consists of the billion triples data sets and some additions like Umbel.   Also the Freebase extract is newer than the challenge original.</p>
<p>The triple count is 1115 million.</p>
<p>In the case of web harvested resources, the data is loaded in one graph per resource.</p>
<p>In the case of larger data sets like <a href="http://dbpedia.org/resource/DBpedia" id="link-id0xc2bf770">DBpedia</a> or the US census, all triples of the provenance share a data set specific graph.</p>
<p>All string literals are additionally indexed in a full text index.  No stop words are used.</p>

<p>Most queries do not specify a graph.  Thus they are evaluated against the union of all the graphs in the database.
The indexing scheme is SPOG, GPOS, POGS, OPGS.  All indices ending in S are bitmap indices.
</p>

<h2>The Queries </h2>


<p>The demo uses Virtuoso SPARQL extensions  in most queries.  These
extensions consist on one hand of well known <a href="http://dbpedia.org/resource/SQL" id="link-id0xaf8cb40">SQL</a> features like
aggregation with grouping and existence and value subqueries and on
the other of <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0xafdceb8">RDF</a> specific features.
The latter include  run time RDFS and OWL inferencing support  and backward
chaining subclasses and transitivity.  
</p>


<h3>Simple Lookups</h3> 

<pre>sparql 
select ?s ?p (bif:search_excerpt (bif:vector (&#39;<a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0xbb64dd0">semantic&#39;, &#39;web</a>&#39;), ?o)) 
where 
  {
    ?s ?p ?o . 
    filter (bif:contains (?o, &quot;&#39;semantic web&#39;&quot;)) 
  } 
limit 10
;
</pre>

<p>This looks up triples with semantic web in the object and makes a search hit summary of the literal, 
highlighting the search terms.
</p>

<pre>sparql 
select ?tp count(*) 
where 
  { 
    ?s ?p2 ?o2 . 
    ?o2 a ?tp . 
    ?s foaf:nick ?o . 
    filter (bif:contains (?o, &quot;plaid_skirt&quot;)) 
  } 
group by ?tp
order by desc 2
limit 40
;
</pre>

<p>This looks at what sorts of things are referenced by the properties of the foaf handle plaid_skirt.</p>
<p>What are these things called?</p>

<pre>sparql 
select ?lbl count(*) 
where 
  { 
    ?s ?p2 ?o2 . 
    ?o2 rdfs:label ?lbl . 
    ?s foaf:nick ?o . 
    filter (bif:contains (?o, &quot;plaid_skirt&quot;)) 
  } 
group by ?lbl
order by desc 2
;
</pre>

<p>Many of these things do not have an rdfs:label.  Let us use a more general concept of label 
which groups dc:title, foaf:name and other name-like properties together.  The subproperties are 
resolved at run time; there is no materialization.
</p>

<pre>sparql 
define input:inference &#39;b3s&#39;
select ?lbl count(*) 
where 
  { 
    ?s ?p2 ?o2 . 
    ?o2 b3s:label ?lbl . 
    ?s foaf:nick ?o . 
    filter (bif:contains (?o, &quot;plaid_skirt&quot;)) 
  } 
group by ?lbl
order by desc 2
;
</pre>

<p>We can list sources by the topics they contain.  
Below we look for graphs that mention terrorist bombing.
</p>

<pre>sparql 
select ?g count(*) 
where 
  { 
    graph ?g 
      {
        ?s ?p ?o . 
        filter (bif:contains (?o, &quot;&#39;terrorist bombing&#39;&quot;)) 
      }
  } 
group by ?g 
order by desc 2
;
</pre>

<p>Now some web 2.0 tagging of search results.  The <a href="http://dbpedia.org/resource/Tag" id="link-id0xa8b89f8">tag</a> cloud of &quot;computer&quot;</p>

<pre>sparql 
select ?lbl count (*) 
where 
  { 
    ?s ?p ?o . 
    ?o bif:contains &quot;computer&quot; . 
    ?s sioc:topic ?tg .
    optional 
      {
        ?tg rdfs:label ?lbl
      }
  }
group by ?lbl 
order by desc 2 
limit 40
;
</pre>

<p>This query will find the posters who talk the most about sex.</p>

<pre>sparql 
select ?auth count (*) 
where 
  { 
    ?d dc:creator ?auth .
    ?d ?p ?o
    filter (bif:contains (?o, &quot;sex&quot;)) 
  } 
group by ?auth
order by desc 2
;
</pre>

<h3>Analytics </h3>

<p>We look for people who are joined by having relatively uncommon interests but do not know each other.</p>

<pre>sparql select ?i ?cnt ?n1 ?n2 ?p1 ?p2 
where 
  {
    {
      select ?i count (*) as ?cnt 
      where 
        { ?p foaf:interest ?i } 
      group by ?i
    }
    filter ( ?cnt &gt; 1 &amp;&amp; ?cnt &lt; 10) .
    ?p1 foaf:interest ?i .
    ?p2 foaf:interest ?i .
    filter  (?p1 != ?p2 &amp;&amp; 
             !bif:exists ((select (1) where {?p1 foaf:knows ?p2 })) &amp;&amp; 
             !bif:exists ((select (1) where {?p2 foaf:knows ?p1 }))) .
    ?p1 foaf:nick ?n1 .
    ?p2 foaf:nick ?n2 .
  } 
order by ?cnt 
limit 50
;
</pre>

<p>The query takes a fairly long time, mostly spent counting the interested people across the 25M interest triples.  
It then takes people that share an interest and checks that neither claims to know the other.  
It then sorts the results, rarest interest first.  The query can be written more efficiently but is 
shown here just to demonstrate that database-wide scans of the population are possible ad hoc.
</p>

<p>Now we go to SQL to make a tag co-occurrence matrix. This can be used for showing a Technorati-style
related tags line at the bottom of a search result page.  This showcases the use of SQL together 
with SPARQL.  The half-matrix of tags t1, t2 with the co-occurrence count at the intersection is 
much more efficiently done in SQL, specially since it gets updated as the data changes.  
This is an example of materialized intermediate results based on warehoused RDF.
</p>

<pre>create table 
tag_count (tcn_tag iri_id_8, 
           tcn_count int, 
           primary key (tcn_tag));
           
alter index 
tag_count on tag_count partition (tcn_tag int (0hexffff00));

create table 
tag_coincidence (tc_t1 iri_id_8, 
                 tc_t2 iri_id_8, 
                 tc_count int, 
                 tc_t1_count int, 
                 tc_t2_count int, 
                 primary key  (tc_t1, tc_t2));

alter index 
tag_coincidence on tag_coincidence partition (tc_t1 int (0hexffff00));

create index 
tc2 on tag_coincidence (tc_t2, tc_t1) partition (tc_t2 int (0hexffff00));
</pre>

<p>How many times is each topic mentioned?</p>

<pre>
insert into tag_count 
  select * 
    from (sparql define output:valmode &quot;LONG&quot; 
                 select ?t count (*) as ?cnt 
                 where 
                   {
                     ?s sioc:topic ?t
                   } 
                 group by ?t) 
    xx option (quietcast);
</pre>

<p>Take all t1, t2 where t1 and t2 are tags of the same subject, store only the permutation where the internal id of t1 &lt; that of t2.</p>

<pre>insert into tag_coincidence  (tc_t1, tc_t2, tc_count)
  select &quot;t1&quot;, &quot;t2&quot;, cnt 
    from 
      (select  &quot;t1&quot;, &quot;t2&quot;, count (*) as cnt 
         from 
           (sparql define output:valmode &quot;LONG&quot;
                   select ?t1 ?t2 
                     where 
                       {
                         ?s sioc:topic ?t1 . 
                         ?s sioc:topic ?t2 
                       }) tags
         where &quot;t1&quot; &lt; &quot;t2&quot; 
         group by &quot;t1&quot;, &quot;t2&quot;) xx
    where isiri_id (&quot;t1&quot;) and 
          isiri_id (&quot;t2&quot;) 
    option (quietcast); 
</pre>

<p>Now put the individual occurrence counts into the same table with the co-occurrence.  This 
denormalization makes the related tags lookup faster.
</p>


<pre>update tag_coincidence 
  set tc_t1_count = (select tcn_count from tag_count where tcn_tag = tc_t1),
      tc_t2_count = (select tcn_count from tag_count where tcn_tag = tc_t2);
</pre>

<p>Now each tag_coincidence row has the joint occurrence count and individual occurrence counts.  
A single select will return a Technorati-style related tags listing.
</p>

<p>To show the <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id0x9d4bc60">URI</a>&#39;s of the tags:
</p>

<pre>select top 10 id_to_iri (tc_T1), id_to_iri (tc_t2), tc_count 
  from tag_coincidence 
  order by tc_count desc;
</pre>

<h3>Social Networks </h3>

<p>We look at what interests people have </p>

<pre>sparql 
select ?o ?cnt  
where 
  {
    {
      select ?o count (*) as ?cnt 
        where 
          {
            ?s foaf:interest ?o
          } 
        group by ?o
    } 
    filter (?cnt &gt; 100) 
  } 
order by desc 2 
limit 100
;
</pre>

<p>Now the same for the Harry Potter fans </p>

<pre>sparql 
select ?i2 count (*) 
where 
  { 
    ?p foaf:interest &lt;<a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0xba0b390">http</a>://www.livejournal.com/interests.bml?int=harry+potter&gt; .
    ?p foaf:interest ?i2 
  } 
group by ?i2 
order by desc 2 
limit 20
;
</pre>

<p>We see whether foaf:knows relations are symmetrical.  We return the top n people that others claim to know without being reciprocally known.</p>

<pre>sparql 
select ?celeb, count (*) 
where 
  { 
    ?claimant foaf:knows ?celeb . 
    filter (!bif:exists ((select (1) 
                          where 
                            {
                              ?celeb foaf:knows ?claimant 
                            }))) 
  } 
group by ?celeb 
order by desc 2 
limit 10
;
</pre>

<p>We look for a well connected person to start from.</p>

<pre>sparql 
select ?p count (*) 
where 
  {
    ?p foaf:knows ?k 
  } 
group by ?p 
order by desc 2 
limit 50
;
</pre>

<p>We look for the most connected of the many online identities of Stefan Decker.</p>

<pre>sparql 
select ?sd count (distinct ?xx) 
where 
  { 
    ?sd a foaf:Person . 
    ?sd ?name ?ns . 
    filter (bif:contains (?ns, &quot;&#39;Stefan Decker&#39;&quot;)) . 
    ?sd foaf:knows ?xx 
  } 
group by ?sd 
order by desc 2
;
</pre>

<p>We count the transitive closure of Stefan Decker&#39;s connections </p>

<pre>sparql 
select count (*) 
where 
  { 
    {
      select * 
      where 
        { 
          ?s foaf:knows ?o 
        }
    }
    option (transitive, t_distinct, t_in(?s), t_out(?o)) . 
    filter (?s = &lt;mailto:stefan.decker@deri.org&gt;)
  }
;
</pre>

<p>Now we do the same while following owl:sameAs links.</p>

<pre>sparql 
define input:same-as &quot;yes&quot;
select count (*) 
where 
  { 
    {
      select * 
      where 
        { 
          ?s foaf:knows ?o 
        }
    }
    option (transitive, t_distinct, t_in(?s), t_out(?o)) . 
    filter (?s = &lt;mailto:stefan.decker@deri.org&gt;)
  }
;
</pre>

<h2>Demo System</h2> 

<p>The system runs on Virtuoso 6 Cluster Edition.  The database is partitioned into 12 partitions, 
each served by a distinct server process. The system demonstrated hosts these 12 servers on 2 
machines, each with 2 x Xeon 5345 and 16GB memory and 4 SATA disks. For scaling, the processes 
and corresponding partitions can be spread over a larger number of machines.  If each ran on its 
own server with 16GB RAM, the whole data set could be served from memory. This is desirable for 
search engine or fast analytics applications. Most of the demonstrated queries run in memory on 
second invocation. The timing difference between first and second run is easily an order of 
magnitude.
</p>
</div>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-09-30#1445">
  <rss:title>OpenLink Software&#39;s Virtuoso Submission to the Billion Triples Challenge</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-09-30T15:39:26Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Introduction We use Virtuoso 6 Cluster Edition to demonstrate the following: Text and structured information based lookups Analytics queries Analysis of co-occurrence of features like interests and tags. Dealing with identity of multiple IRI&#39;s (owl:sameAs) The demo is based on a set of canned SPARQL queries that can be invoked using the OpenLink Data Explorer (ODE) Firefox extension. The demo queries can also be run directly against the SPARQL end point. The demo is being worked on at the time of submission and may be shown online by appointment. Automatic annotation of the data based on named entity extraction is being worked on at the time of this submission. By the time of ISWC 2008 the set of sample queries will be enhanced with queries based on extracted named entities and their relationships in the UMBEL and Open CYC ontologies. Also examples involving owl:sameAs are being added, likewise with similarity metrics and search hit scores. The Data The database consists of the billion triples data sets and some additions like Umbel. Also the Freebase extract is newer than the challenge original. The triple count is 1115 million. In the case of web harvested resources, the data is loaded in one graph per resource. In the case of larger data sets like Dbpedia or the US census, all triples of the provenance share a data set specific graph. All string literals are additionally indexed in a full text index. No stop words are used. Most queries do not specify a graph. Thus they are evaluated against the union of all the graphs in the database. The indexing scheme is SPOG, GPOS, POGS, OPGS. All indices ending in S are bitmap indices. The Queries The demo uses Virtuoso SPARQL extensions in most queries. These extensions consist on one hand of well known SQL features like aggregation with grouping and existence and value subqueries and on the other of RDF specific features. 
The latter include run time RDFS and OWL inferencing support and backward chaining subclasses and transitivity. Simple Lookups sparql select ?s ?p (bif:search_excerpt (bif:vector (&#39;semantic&#39;, &#39;web&#39;), ?o)) where { ?s ?p ?o . filter (bif:contains (?o, &quot;&#39;semantic web&#39;&quot;)) } limit 10 ; This looks up triples with semantic web in the object and makes a search hit summary of the literal, highlighting the search terms. sparql select ?tp count(*) where { ?s ?p2 ?o2 . ?o2 a ?tp . ?s foaf:nick ?o . filter (bif:contains (?o, &quot;plaid_skirt&quot;)) } group by ?tp order by desc 2 limit 40 ; This looks at what sorts of things are referenced by the properties of the foaf handle plaid_skirt. What are these things called? sparql select ?lbl count(*) where { ?s ?p2 ?o2 . ?o2 rdfs:label ?lbl . ?s foaf:nick ?o . filter (bif:contains (?o, &quot;plaid_skirt&quot;)) } group by ?lbl order by desc 2 ; Many of these things do not have a rdfs:label. Let us use a more general concept of lable which groups dc:title, foaf:name and other name-like properties together. The subproperties are resolved at run time, there is no materialization. sparql define input:inference &#39;b3s&#39; select ?lbl count(*) where { ?s ?p2 ?o2 . ?o2 b3s:label ?lbl . ?s foaf:nick ?o . filter (bif:contains (?o, &quot;plaid_skirt&quot;)) } group by ?lbl order by desc 2 ; We can list sources by the topics they contain. Below we look for graphs that mention terrorist bombing. sparql select ?g count(*) where { graph ?g { ?s ?p ?o . filter (bif:contains (?o, &quot;&#39;terrorist bombing&#39;&quot;)) } } group by ?g order by desc 2 ; Now some web 2.0 tagging of search results. The tag cloud of &quot;computer&quot; sparql select ?lbl count (*) where { ?s ?p ?o . ?o bif:contains &quot;computer&quot; . ?s sioc:topic ?tg . optional { ?tg rdfs:label ?lbl } } group by ?lbl order by desc 2 limit 40 ; This query will find the posters who talk the most about sex. 
sparql select ?auth count (*) where { ?d dc:creator ?auth . ?d ?p ?o filter (bif:contains (?o, &quot;sex&quot;)) } group by ?auth order by desc 2 ; Analytics We look for people who are joined by having relatively uncommon interests but do not know each other. sparql select ?i ?cnt ?n1 ?n2 ?p1 ?p2 where { { select ?i count (*) as ?cnt where { ?p foaf:interest ?i } group by ?i } filter ( ?cnt &gt; 1 &amp;&amp; ?cnt &lt; 10) . ?p1 foaf:interest ?i . ?p2 foaf:interest ?i . filter (?p1 != ?p2 &amp;&amp; !bif:exists ((select (1) where {?p1 foaf:knows ?p2 })) &amp;&amp; !bif:exists ((select (1) where {?p2 foaf:knows ?p1 }))) . ?p1 foaf:nick ?n1 . ?p2 foaf:nick ?n2 . } order by ?cnt limit 50 ; The query takes a fairly long time, mostly spent counting the interested in 25M interest triples. It then takes people that share the interest and checks that neither claims to know the other. It then sorts the results rarest interest first. The query can be written more efficently but is here just to show that database-wide scans of the population are possible ad hoc. Now we go to SQL to make a tag co-occurrence matrix. This can be used for showing a Technorati-style related tags line at the bottom of a search result page. This showcases the use of SQL together with SPARQL. The half-matrix of tags t1, t2 with the co-occurrence count at the intersection is much more efficiently done in SQL, specially since it gets updated as the data changes. This is an example of materialized intermediate results based on warehoused RDF. 
create table tag_count (tcn_tag iri_id_8, tcn_count int, primary key (tcn_tag)); alter index tag_count on tag_count partition (tcn_tag int (0hexffff00)); create table tag_coincidence (tc_t1 iri_id_8, tc_t2 iri_id_8, tc_count int, tc_t1_count int, tc_t2_count int, primary key (tc_t1, tc_t2)) alter index tag_coincidence on tag_coincidence partition (tc_t1 int (0hexffff00)); create index tc2 on tag_coincidence (tc_t2, tc_t1) partition (tc_t2 int (0hexffff00)); How many times each topic is mentioned? insert into tag_count select * from (sparql define output:valmode &quot;LONG&quot; select ?t count (*) as ?cnt where { ?s sioc:topic ?t } group by ?t) xx option (quietcast); Take all t1, t2 where t1 and t2 are tags of the same subject, store only the permutation where the internal id of t1 &lt; that of t2. insert into tag_coincidence (tc_t1, tc_t2, tc_count) select &quot;t1&quot;, &quot;t2&quot;, cnt from (select &quot;t1&quot;, &quot;t2&quot;, count (*) as cnt from (sparql define output:valmode &quot;LONG&quot; select ?t1 ?t2 where { ?s sioc:topic ?t1 . ?s sioc:topic ?t2 }) tags where &quot;t1&quot; &lt; &quot;t2&quot; group by &quot;t1&quot;, &quot;t2&quot;) xx where isiri_id (&quot;t1&quot;) and isiri_id (&quot;t2&quot;) option (quietcast); Now put the individual occurrence counts into the same table with the co-occurrence. This denormalization makes the related tags lookup faster. update tag_coincidence set tc_t1_count = (select tcn_count from tag_count where tcn_tag = tc_t1), tc_t2_count = (select tcn_count from tag_count where tcn_tag = tc_t2); Now each tag_coincidence row has the joint occurrence count and individual occurrence counts. A single select will return a Technorati-style related tags listing. 
To show the URI&#39;s of the tags: select top 10 id_to_iri (tc_T1), id_to_iri (tc_t2), tc_count from tag_coincidence order by tc_count desc; Social Networks We look at what interests people have sparql select ?o ?cnt where { { select ?o count (*) as ?cnt where { ?s foaf:interest ?o } group by ?o } filter (?cnt &gt; 100) } order by desc 2 limit 100 ; Now the same for the Harry Potter fans sparql select ?i2 count (*) where { ?p foaf:interest &lt;http://www.livejournal.com/interests.bml?int=harry+potter&gt; . ?p foaf:interest ?i2 } group by ?i2 order by desc 2 limit 20 ; We see whether knows relations are symmmetrical. We return the top n people that others claim to know without being reciprocally known. sparql select ?celeb, count (*) where { ?claimant foaf:knows ?celeb . filter (!bif:exists ((select (1) where { ?celeb foaf:knows ?claimant }))) } group by ?celeb order by desc 2 limit 10 ; We look for a well connected person to start from. sparql select ?p count (*) where { ?p foaf:knows ?k } group by ?p order by desc 2 limit 50 ; We look for the most connected of the many online identities of Stefan Decker. sparql select ?sd count (distinct ?xx) where { ?sd a foaf:Person . ?sd ?name ?ns . filter (bif:contains (?ns, &quot;&#39;Stefan Decker&#39;&quot;)) . ?sd foaf:knows ?xx } group by ?sd order by desc 2 ; We count the transitive closure of Stefan Decker&#39;s connections sparql select count (*) where { { select * where { ?s foaf:knows ?o } } option (transitive, t_distinct, t_in(?s), t_out(?o)) . filter (?s = &lt;mailto:stefan.decker@deri.org&gt;) } ; Now we do the same while following owl:sameAs links. sparql define input:same-as &quot;yes&quot; select count (*) where { { select * where { ?s foaf:knows ?o } } option (transitive, t_distinct, t_in(?s), t_out(?o)) . filter (?s = &lt;mailto:stefan.decker@deri.org&gt;) } ; Demo System The system runs on Virtuoso 6 Cluster Edition. The database is partitioned into 12 partitions, each served by a distinct server process. 
The system demonstrated hosts these 12 servers on 2 machines, each with 2 x Xeon 5345 and 16GB memory and 4 SATA disks. For scaling, the processes and corresponding partitions can be spread over a larger number of machines. If each ran on its own server with 16GB RAM, the whole data set could be served from memory. This is desirable for search engine or fast analytics applications. Most of the demonstrated queries run in memory on second invocation. The timing difference between first and second run is easily an order of magnitude.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<div>
<h2>Introduction</h2> 

<p>We use <a href="http://virtuoso.openlinksw.com" id="link-id0xa278560">Virtuoso</a> 6 Cluster Edition to demonstrate the following:</p>
<ul>
<li>Text and structured <a href="http://dbpedia.org/resource/Information" id="link-id0xb3a4490">information</a> based lookups</li>
<li>Analytics queries</li>
<li>Analysis of co-occurrence of features like interests and tags.</li>
<li>Dealing with identity of multiple IRI&#39;s (<a href="http://dbpedia.org/resource/Web_Ontology_Language" id="link-id0xa904bd8">owl</a>:sameAs)</li>
</ul>

<p>The demo is based on a set of canned <a href="http://dbpedia.org/resource/SPARQL" id="link-id0xac185d0">SPARQL</a> queries that can be invoked using the <a href="http://ode.openlinksw.com/" id="link-id0xb8efe28">OpenLink Data Explorer</a> (<a href="http://ode.openlinksw.com/" id="link-id0xb341808">ODE</a>) Firefox extension.</p>
<p>The demo queries can also be run directly against the SPARQL end point.</p>

<p>The demo is being worked on at the time of submission and may be shown online by appointment.</p>

<p>Automatic annotation of the <a href="http://dbpedia.org/resource/Data" id="link-id0xa2fcc88">data</a> based on <a href="http://dbpedia.org/resource/Named_entity_recognition" id="link-id0xc085440">named entity extraction</a> is
being worked on at the time of this submission.  By the time of ISWC
2008 the set of sample queries will be enhanced with queries based on
extracted <a href="http://dbpedia.org/resource/Named_entity_recognition" id="link-id0xa92b3e0">named entities</a> and their relationships in the <a href="http://umbel.org/about/" id="link-id0xa1c7c38">UMBEL</a> and Open
CYC ontologies.
</p>

<p>Also examples involving owl:sameAs are being added, likewise  with similarity metrics and search hit scores.</p>

<h2>The Data</h2>

<p>The database consists of the billion triples data sets and some additions like Umbel.   Also the Freebase extract is newer than the challenge original.</p>
<p>The triple count is 1115 million.</p>
<p>In the case of web harvested resources, the data is loaded in one graph per resource.</p>
<p>In the case of larger data sets like <a href="http://dbpedia.org/resource/DBpedia" id="link-id0xa949850">DBpedia</a> or the US census, all triples of the provenance share a data set specific graph.</p>
<p>All string literals are additionally indexed in a full text index.  No stop words are used.</p>

<p>Most queries do not specify a graph.  Thus they are evaluated against the union of all the graphs in the database.
The indexing scheme is SPOG, GPOS, POGS, OPGS.  All indices ending in S are bitmap indices.
</p>

<h2>The Queries </h2>


<p>The demo uses Virtuoso SPARQL extensions  in most queries.  These
extensions consist on one hand of well known <a href="http://dbpedia.org/resource/SQL" id="link-id0xc116190">SQL</a> features like
aggregation with grouping and existence and value subqueries and on
the other of <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0xa9047f0">RDF</a> specific features.
The latter include  run time RDFS and OWL inferencing support  and backward
chaining subclasses and transitivity.  
</p>


<h3>Simple Lookups</h3> 

<pre>sparql 
select ?s ?p (bif:search_excerpt (bif:vector (&#39;<a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0xbe38070">semantic&#39;, &#39;web</a>&#39;), ?o)) 
where 
  {
    ?s ?p ?o . 
    filter (bif:contains (?o, &quot;&#39;semantic web&#39;&quot;)) 
  } 
limit 10
;
</pre>

<p>This looks up triples with semantic web in the object and makes a search hit summary of the literal, 
highlighting the search terms.
</p>

<pre>sparql 
select ?tp count(*) 
where 
  { 
    ?s ?p2 ?o2 . 
    ?o2 a ?tp . 
    ?s foaf:nick ?o . 
    filter (bif:contains (?o, &quot;plaid_skirt&quot;)) 
  } 
group by ?tp
order by desc 2
limit 40
;
</pre>

<p>This looks at what sorts of things are referenced by the properties of the foaf handle plaid_skirt.</p>
<p>What are these things called?</p>

<pre>sparql 
select ?lbl count(*) 
where 
  { 
    ?s ?p2 ?o2 . 
    ?o2 rdfs:label ?lbl . 
    ?s foaf:nick ?o . 
    filter (bif:contains (?o, &quot;plaid_skirt&quot;)) 
  } 
group by ?lbl
order by desc 2
;
</pre>

<p>Many of these things do not have an rdfs:label.  Let us use a more general concept of label 
which groups dc:title, foaf:name and other name-like properties together.  The subproperties are 
resolved at run time, there is no materialization.
</p>

<pre>sparql 
define input:inference &#39;b3s&#39;
select ?lbl count(*) 
where 
  { 
    ?s ?p2 ?o2 . 
    ?o2 b3s:label ?lbl . 
    ?s foaf:nick ?o . 
    filter (bif:contains (?o, &quot;plaid_skirt&quot;)) 
  } 
group by ?lbl
order by desc 2
;
</pre>

<p>We can list sources by the topics they contain.  
Below we look for graphs that mention terrorist bombing.
</p>

<pre>sparql 
select ?g count(*) 
where 
  { 
    graph ?g 
      {
        ?s ?p ?o . 
        filter (bif:contains (?o, &quot;&#39;terrorist bombing&#39;&quot;)) 
      }
  } 
group by ?g 
order by desc 2
;
</pre>

<p>Now some web 2.0 tagging of search results.  The <a href="http://dbpedia.org/resource/Tag" id="link-id0xa366510">tag</a> cloud of &quot;computer&quot;</p>

<pre>sparql 
select ?lbl count (*) 
where 
  { 
    ?s ?p ?o . 
    ?o bif:contains &quot;computer&quot; . 
    ?s sioc:topic ?tg .
    optional 
      {
        ?tg rdfs:label ?lbl
      }
  }
group by ?lbl 
order by desc 2 
limit 40
;
</pre>

<p>This query will find the posters who talk the most about sex.</p>

<pre>sparql 
select ?auth count (*) 
where 
  { 
    ?d dc:creator ?auth .
    ?d ?p ?o
    filter (bif:contains (?o, &quot;sex&quot;)) 
  } 
group by ?auth
order by desc 2
;
</pre>

<h3>Analytics </h3>

<p>We look for people who are joined by having relatively uncommon interests but do not know each other.</p>

<pre>sparql select ?i ?cnt ?n1 ?n2 ?p1 ?p2 
where 
  {
    {
      select ?i count (*) as ?cnt 
      where 
        { ?p foaf:interest ?i } 
      group by ?i
    }
    filter ( ?cnt &gt; 1 &amp;&amp; ?cnt &lt; 10) .
    ?p1 foaf:interest ?i .
    ?p2 foaf:interest ?i .
    filter  (?p1 != ?p2 &amp;&amp; 
             !bif:exists ((select (1) where {?p1 foaf:knows ?p2 })) &amp;&amp; 
             !bif:exists ((select (1) where {?p2 foaf:knows ?p1 }))) .
    ?p1 foaf:nick ?n1 .
    ?p2 foaf:nick ?n2 .
  } 
order by ?cnt 
limit 50
;
</pre>

<p>The query takes a fairly long time, mostly spent counting the interested people across the 25M interest triples.  
It then takes people that share an interest and checks that neither claims to know the other.  
It then sorts the results, rarest interest first.  The query can be written more efficiently but is 
shown here just to demonstrate that database-wide scans of the population are possible ad hoc.
</p>

<p>Now we go to SQL to make a tag co-occurrence matrix. This can be used for showing a Technorati-style
related tags line at the bottom of a search result page.  This showcases the use of SQL together 
with SPARQL.  The half-matrix of tags t1, t2 with the co-occurrence count at the intersection is 
much more efficiently done in SQL, specially since it gets updated as the data changes.  
This is an example of materialized intermediate results based on warehoused RDF.
</p>

<pre>create table 
tag_count (tcn_tag iri_id_8, 
           tcn_count int, 
           primary key (tcn_tag));
           
alter index 
tag_count on tag_count partition (tcn_tag int (0hexffff00));

create table 
tag_coincidence (tc_t1 iri_id_8, 
                 tc_t2 iri_id_8, 
                 tc_count int, 
                 tc_t1_count int, 
                 tc_t2_count int, 
                 primary key  (tc_t1, tc_t2));

alter index 
tag_coincidence on tag_coincidence partition (tc_t1 int (0hexffff00));

create index 
tc2 on tag_coincidence (tc_t2, tc_t1) partition (tc_t2 int (0hexffff00));
</pre>

<p>How many times is each topic mentioned?</p>

<pre>
insert into tag_count 
  select * 
    from (sparql define output:valmode &quot;LONG&quot; 
                 select ?t count (*) as ?cnt 
                 where 
                   {
                     ?s sioc:topic ?t
                   } 
                 group by ?t) 
    xx option (quietcast);
</pre>

<p>Take all t1, t2 where t1 and t2 are tags of the same subject, store only the permutation where the internal id of t1 &lt; that of t2.</p>

<pre>insert into tag_coincidence  (tc_t1, tc_t2, tc_count)
  select &quot;t1&quot;, &quot;t2&quot;, cnt 
    from 
      (select  &quot;t1&quot;, &quot;t2&quot;, count (*) as cnt 
         from 
           (sparql define output:valmode &quot;LONG&quot;
                   select ?t1 ?t2 
                     where 
                       {
                         ?s sioc:topic ?t1 . 
                         ?s sioc:topic ?t2 
                       }) tags
         where &quot;t1&quot; &lt; &quot;t2&quot; 
         group by &quot;t1&quot;, &quot;t2&quot;) xx
    where isiri_id (&quot;t1&quot;) and 
          isiri_id (&quot;t2&quot;) 
    option (quietcast); 
</pre>

<p>Now put the individual occurrence counts into the same table with the co-occurrence.  This 
denormalization makes the related tags lookup faster.
</p>


<pre>update tag_coincidence 
  set tc_t1_count = (select tcn_count from tag_count where tcn_tag = tc_t1),
      tc_t2_count = (select tcn_count from tag_count where tcn_tag = tc_t2);
</pre>

<p>Now each tag_coincidence row has the joint occurrence count and individual occurrence counts.  
A single select will return a Technorati-style related tags listing.
</p>

<p>To show the <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id0xaf355c8">URI</a>&#39;s of the tags:
</p>

<pre>select top 10 id_to_iri (tc_T1), id_to_iri (tc_t2), tc_count 
  from tag_coincidence 
  order by tc_count desc;
</pre>

<h3>Social Networks </h3>

<p>We look at what interests people have </p>

<pre>sparql 
select ?o ?cnt  
where 
  {
    {
      select ?o count (*) as ?cnt 
        where 
          {
            ?s foaf:interest ?o
          } 
        group by ?o
    } 
    filter (?cnt &gt; 100) 
  } 
order by desc 2 
limit 100
;
</pre>

<p>Now the same for the Harry Potter fans </p>

<pre>sparql 
select ?i2 count (*) 
where 
  { 
    ?p foaf:interest &lt;<a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0xa274410">http</a>://www.livejournal.com/interests.bml?int=harry+potter&gt; .
    ?p foaf:interest ?i2 
  } 
group by ?i2 
order by desc 2 
limit 20
;
</pre>

<p>We see whether foaf:knows relations are symmetrical.  We return the top n people that others claim to know without being reciprocally known.</p>

<pre>sparql 
select ?celeb, count (*) 
where 
  { 
    ?claimant foaf:knows ?celeb . 
    filter (!bif:exists ((select (1) 
                          where 
                            {
                              ?celeb foaf:knows ?claimant 
                            }))) 
  } 
group by ?celeb 
order by desc 2 
limit 10
;
</pre>

<p>We look for a well connected person to start from.</p>

<pre>sparql 
select ?p count (*) 
where 
  {
    ?p foaf:knows ?k 
  } 
group by ?p 
order by desc 2 
limit 50
;
</pre>

<p>We look for the most connected of the many online identities of Stefan Decker.</p>

<pre>sparql 
select ?sd count (distinct ?xx) 
where 
  { 
    ?sd a foaf:Person . 
    ?sd ?name ?ns . 
    filter (bif:contains (?ns, &quot;&#39;Stefan Decker&#39;&quot;)) . 
    ?sd foaf:knows ?xx 
  } 
group by ?sd 
order by desc 2
;
</pre>

<p>We count the transitive closure of Stefan Decker&#39;s connections </p>

<pre>sparql 
select count (*) 
where 
  { 
    {
      select * 
      where 
        { 
          ?s foaf:knows ?o 
        }
    }
    option (transitive, t_distinct, t_in(?s), t_out(?o)) . 
    filter (?s = &lt;mailto:stefan.decker@deri.org&gt;)
  }
;
</pre>

<p>Now we do the same while following owl:sameAs links.</p>

<pre>sparql 
define input:same-as &quot;yes&quot;
select count (*) 
where 
  { 
    {
      select * 
      where 
        { 
          ?s foaf:knows ?o 
        }
    }
    option (transitive, t_distinct, t_in(?s), t_out(?o)) . 
    filter (?s = &lt;mailto:stefan.decker@deri.org&gt;)
  }
;
</pre>

<h2>Demo System</h2> 

<p>The system runs on Virtuoso 6 Cluster Edition.  The database is partitioned into 12 partitions, 
each served by a distinct server process. The system demonstrated hosts these 12 servers on 2 
machines, each with 2 x Xeon 5345 and 16GB memory and 4 SATA disks. For scaling, the processes 
and corresponding partitions can be spread over a larger number of machines.  If each ran on its 
own server with 16GB RAM, the whole data set could be served from memory. This is desirable for 
search engine or fast analytics applications. Most of the demonstrated queries run in memory on 
second invocation. The timing difference between first and second run is easily an order of 
magnitude.
</p>
</div>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-09-11#1437">
  <rss:title>Business Value of Linked Data (Enterprise Angle)? </rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-09-11T18:59:24Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">All enterprises run IS/MIS/EIS systems that are supposed to enable optimized exploitation of data, information, and knowledge. Unfortunately, applications, services (SOAP or REST), database engines, middleware, operating systems, programming languages, development frameworks, network protocols, network topologies, or some other piece of infrastructure, eventually lay claim (possessively) to the data. Courtesy of Linked Data, we are now able to extend the &quot;document to document&quot; linking mechanism of the Web (Hypertext Linking) to more granular &quot;entity to entity&quot; level linking. And in doing so, we have a layer of abstraction that in one swoop alleviates all of the infrastructure oriented data access impediments of yore. I know this sounds simplistic, but be rest assured, imbibing Linked Data&#39;s value proposition is really just that simple, once you engage solutions (e.g. Virtuoso) that enable you to deploy Linked Data across your enterprise. Example: Microsoft ACCESS, SQL Server, and Virtuoso all use the Northwind SQL DB Schema as the basis of the demonstration database shipped with each DBMS product. This schema is comprised of common IS/MIS entities that include: Customers, Contacts, Orders, Products, Employees etc. What we all really want to do as data, information, and knowledge consumers and/or dispatchers, is be no more than a single &quot;mouse click&quot; away from relevant data/information/knowledge data access and/or exploration. Even better (but not always so obvious), we also want anyone in our network (company, division, department, cube-cluster) to inherit these data access efficiencies. In this example, the Web Page about the Customer &quot;ALKI&quot; provides me with a myriad of exploration and data access paths e.g., when I click on the foaf:primarytopic property value link. 
This simple example, via a single Web Page, should put to rest any doubts about the utility of Linked Data. Of course this is an old demo, but this time around the UI is minimalist as my prior attempts skipped a few steps i.e., starting from within a Linked Data explorer/browser. Important note: I haven&#39;t exported SQL into an RDF data warehouse, I am converting the SQL into RDF Linked Data on the fly which has two fundamental benefits: No vulnerability to changes in the source DBMS Superior performance over the RDF warehouse since the source schema is SQL based and I can leverage the optimization of the underlying SQL engine when translating between SPARQL and SQL. Enjoy! Related Requirements for Relational to RDF Mapping Handling Graph Transitivity in a SQL/RDF Hybrid Engine How Virtuoso handles the Web Aspects of Linked Data Queries.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>All enterprises run IS/MIS/EIS systems that are supposed to enable optimized exploitation of <a href="http://dbpedia.org/resource/Data">data</a>, <a href="http://dbpedia.org/resource/Information" id="link-id1408bee8">information</a>, and <a href="http://dbpedia.org/resource/Knowledge" id="link-id14c429a8">knowledge</a>. Unfortunately, applications, services (SOAP or REST), database engines, middleware, operating systems, programming languages, development frameworks, network protocols, network topologies, or some other piece of infrastructure, eventually lay claim (possessively) to the data.</p>

<p>Courtesy of <a href="http://dbpedia.org/resource/Linked_Data" id="link-id10f98db8">Linked Data</a>, we are now able to extend the &quot;document to document&quot; linking mechanism of the <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> (Hypertext Linking) to more granular &quot;<a href="http://dbpedia.org/resource/Entity" id="link-id14410810">entity</a> to <a href="http://dbpedia.org/resource/Entity" id="link-id10dbb420">entity</a>&quot; level linking. And in doing so, we have a layer of abstraction that in one swoop alleviates all of the infrastructure oriented data access impediments of yore. I know this sounds simplistic, but rest assured, imbibing <a href="http://dbpedia.org/resource/Linked_Data" id="link-id14b6af20">Linked Data</a>&#39;s value proposition is really just that simple, once you engage solutions (e.g. <a href="http://virtuoso.openlinksw.com" id="link-id14ce6a20">Virtuoso</a>) that enable you to deploy <a href="http://dbpedia.org/resource/Linked_Data" id="link-id1151c718">Linked Data</a> across your enterprise.</p>

<h3>Example: </h3>
<p>Microsoft ACCESS, <a href="http://dbpedia.org/resource/SQL" id="link-id14ef3b08">SQL</a> Server, and <a href="http://virtuoso.openlinksw.com" id="link-id10d865b8">Virtuoso</a> all use the Northwind <a href="http://dbpedia.org/resource/SQL" id="link-id10b04250">SQL</a> DB Schema as the basis of the demonstration database shipped with each DBMS product. This schema is comprised of common IS/MIS entities that include: Customers, Contacts, Orders, Products, Employees etc.</p>

<p>What we all really want to do as data, <a href="http://dbpedia.org/resource/Information" id="link-id110dd7a0">information</a>, and <a href="http://dbpedia.org/resource/Knowledge" id="link-id11484408">knowledge</a> consumers and/or dispatchers, is be no more than a single &quot;mouse click&quot; away from relevant data/<a href="http://dbpedia.org/resource/Information" id="link-id10c755c8">information</a>/<a href="http://dbpedia.org/resource/Knowledge" id="link-id1464ac88">knowledge</a> data access and/or exploration. Even better (but not always so obvious), we also want anyone in our network (company, division, department, cube-cluster) to inherit these data access efficiencies.</p>

<p>In this example, the <a href="http://demo.openlinksw.com/Northwind/Customer/ALFKI" id="link-id14ab8ed0">Web Page about the Customer &quot;ALFKI&quot;</a> provides <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-id14bdb360">me</a> with a myriad of exploration and data access paths e.g., when I click on the <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id10c388e0">foaf</a>:primaryTopic property value link.</p>

<p>This simple example, via a single Web Page, should put to rest any doubts about the utility of <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0xb042fd8">Linked Data</a>. Of course this is <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=alfki&type=text&output=html" id="link-id10ccccf0">an old demo</a>, but this time around the UI is minimalist as my prior attempts skipped a few steps i.e., starting from within a <a href="http://ode.openlinksw.com" id="link-id10f8a530">Linked Data explorer/browser</a>.</p>

<p>Important note: I haven&#39;t exported <a href="http://dbpedia.org/resource/SQL" id="link-id0x16dfc2a0">SQL</a> into an RDF data warehouse, I am converting the SQL into RDF <a href="http://dbpedia.org/resource/Linked_Data">Linked Data</a> on the fly which has two fundamental benefits:</p>
<ol>
<li>No vulnerability to changes in the source DBMS</li>
<li>Superior performance over the RDF warehouse since the source schema is SQL based and I can leverage the optimization of the underlying SQL engine when translating between <a href="http://dbpedia.org/resource/SPARQL" id="link-id0xd9a4030">SPARQL</a> and SQL.</li>
</ol>


<p>Enjoy!</p>
<h3>Related</h3>
<ol>
<li>
  <a href="http://www.openlinksw.com/dataspace/oerling/weblog/Orri%20Erling%27s%20Blog/1434" id="link-id11338a48">Requirements for Relational to RDF Mapping</a>
</li>
<li>
  <a href="http://www.openlinksw.com/dataspace/oerling/weblog/Orri%20Erling%27s%20Blog/1433" id="link-id10d84278">Handling Graph Transitivity in a SQL/RDF Hybrid Engine</a>
</li>
<li>
  <a href="http://www.openlinksw.com/dataspace/oerling/weblog/Orri%20Erling%27s%20Blog/1431" id="link-id10c762e8">How Virtuoso handles the Web Aspects of Linked Data Queries</a>.</li>
</ol>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-08-15#1413">
  <rss:title>Response to: Whole Data Post (Update 3)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-08-15T13:06:12Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">This post is in response to Glenn McDonald&#39;s post titled: Whole Data, where he highlights a number of issues relating to &quot;Semantic Web&quot; marketing communications and overall messaging, from his perspective. By coincidence, Glenn and I presented at this month&#39;s Cambridge Semantic Web Gathering. I&#39;ve provided a dump of Glenn&#39;s issues and my responses below: Issue - RDF Ingenious data decomposition idea, but: too low-level; the assembly language of data, where we need Java or Ruby &quot;resource&quot; is not the issue; there&#39;s no such thing as &quot;metadata&quot;, it&#39;s all data; &quot;meta&quot; is a perspective lists need to be effortless, not painful and obscure nodes need to be represented, not just implied; they need types and literals in a more pervasive, integrated way. Response: RDF is a Graph based Data Model it stands for Resource Description Framework. The Metadata data angle comes from it&#39;s Meta Content Framework (MCF) origins. You can express and serialize data based on the RDF Data Model using: Turtle, N3, TriX, N-Triples, and RDF/XML. Issue - SPARQL (and Freebase&#39;s MQL) These are just appeasement: - old query paradigm: fishing in dark water with superstitiously tied lures; only works well in carefully stocked lakes - we don&#39;t ask questions by defining answer shapes and then hoping they&#39;re dredged up whole. Response: SPARQL, MQL, and Entity-SQL are Graph Model oriented Query Languages. Query Languages always accompany Database Engines. SQL is the Relational Model equivalent. Issue - Linked Data Noble attempt to ground the abstract, but: - URI dereferencing/namespace/open-world issues focus too much technical attention on cross-source cases where the human issues dwarf the technical ones anyway - FOAF query over the people in this room? forget it. 
- link asymmetry doesn&#39;t scale - identity doesn&#39;t scale - generating RDF from non-graph sources: more appeasement, right where the win from actually converting could be biggest! Response: Innovative use of HTTP to deliver &quot;Data Access by Reference&quot; to the Linked Data Web. When you have a Data Model, Database Engine, and Query Language, the next thing you need is a Data Access mechanism that provides &quot;Data Access by Reference&quot;. ODBC and JDBC (amongst others) provide &quot;Data Access by Reference&quot; via Data Source Names. Linked Data is about the same thing (URIs are Data Source Names) with the following differences: Naming is scoped to the entity level rather than container level HTTP&#39;s use within the data source naming scheme expands the referencability of the Named Entity Descriptions beyond traditional confines such as applications, operating systems, and database engines. Issue - Giant Global Graph Hugely motivating and powerful idea, worthy of a superhero (Graphius!), but: - giant and global parts are too hard, and starting global makes every problem harder - local projects become unmanageable in global context (Cyc, Freebase data-modeling lists...). And my thus my plea, again. Forget &quot;semantic&quot; and &quot;web&quot;, let&#39;s fix the database tech first: - node/arc data-model, path-based exploratory query-model - data-graph applications built easily on top of this common model; building them has to be easy, because if it&#39;s hard, they&#39;ll be bad - given good database tech, good web data-publishing tech will be trivial! - given good tools for graphs, the problems of uniting them will be only as hard as they have to be. Response: Giant Global Graph is just another moniker for a &quot;Web of Linked Data&quot; or &quot;Linked Data Web&quot;. Multi-Model Database technology that meshes the best of the Graph &amp; Relational Models exist. 
In a nutshell, this is what Virtuoso is all about and it&#39;s existed for a very long time :-) Virtuoso is also a Virtual DBMS engine (so you can see Heterogeneous Relational Data via Graph Model Context Lenses). Naturally, it is also a Linked Data Deployment platform (or Linked Data Server). The issue isn&#39;t the &quot;Semantic Web&quot; moniker per se., it&#39;s about how Linked Data (foundation layer of Semantic Web) gets introduced to users. As I said during the MIT Gathering: &quot;The Web is experienced via Web Browsers primarily, so any enhancement to the Web must be exposed via traditional Web Browsers&quot;, which is why we&#39;ve opted to simply add &quot;View Linked Data Sources&quot; to the existing set of common Browser options that includes: View page in rendered form (default) View page source (i.e., how you see the markup behind the page) By exposing the Linked Data Web option as described above, you enable the Web user to knowingly transition from the traditional Rendered (X)HTML page view to the Linked Data View (i.e., structured data behind the page). This simple &quot;User Interaction&quot; tweak makes the notion of exploiting a Structured Web somewhat clearer. The Linked Data Web isn&#39;t a panacea. It&#39;s just an addition to the existing Web that enriches the things you can do with the Web. Its predominance, like any application feature, will be subject to the degrees to which it delivers tangible value or materializes internal and external opportunity costs. Note: The Web isn&#39;t ubiquitous today because all its users grokked HTML Markup. Its ubiquity is a function of opportunity costs: there simply came a point in the Web bootstrap when nobody could afford the opportunity costs associated with being off the Web. The same thing will play out with Linked Data and the broader Semantic Web vision. 
Links: Linked Data Journey part of my Linked Data Planet Presentation Remix(from slides 15 to 22 - which include bits from TimBL&#39;s presentation) OpenLink Data Explorer OpenLink Data Explorer Screenshots and examples.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>This post is in response to <a href="http://www.furia.com" id="link-id107907b8">Glenn McDonald</a>&#39;s post titled: <a href="http://www.furia.com/page.cgi?type=log&id=308" id="link-id13dcf2d0">Whole Data</a>, where he highlights a number of issues relating to &quot;<a href="http://dbpedia.org/resource/Semantic_Web" id="link-id1016c1f0">Semantic Web</a>&quot; marketing communications and overall messaging, from his perspective.</p>
<p>
By coincidence, Glenn and I presented at this month&#39;s Cambridge <a href="http://dbpedia.org/resource/Semantic_Web" id="link-idd526f48">Semantic Web</a> Gathering.</p> 

<p>I&#39;ve provided a dump of Glenn&#39;s issues and my responses below:</p>

<h3>Issue - RDF</h3>

<ul>
<li>Ingenious <a href="http://dbpedia.org/resource/Data">data</a> decomposition idea, but: </li>
<li>too low-level; the assembly language of data, where we need Java or <a href="http://dbpedia.org/resource/Ruby_programming_language" id="link-id103f3dd0">Ruby</a> </li>
<li>&quot;resource&quot; is not the issue; there&#39;s no such thing as &quot;metadata&quot;, it&#39;s all data; &quot;meta&quot; is a perspective </li>
<li>lists need to be effortless, not painful and obscure </li>
<li>nodes need to be represented, not just implied; they need types and literals in a more pervasive, integrated way.  </li>
</ul>

<h4>Response:</h4>
<p>RDF is a Graph based Data Model; it stands for Resource Description Framework. The Metadata angle comes from its <a href="http://dbpedia.org/resource/Meta_Content_Framework" id="link-id1690df60">Meta Content Framework (MCF)</a> origins. You can express and serialize data based on the RDF Data Model using: Turtle, N3, TriX, N-Triples, and RDF/XML.</p>

<h3>Issue - <a href="http://dbpedia.org/resource/SPARQL" id="link-id10234b38">SPARQL</a> (and Freebase&#39;s MQL)</h3>
<p>These are just appeasement: <br />- old query paradigm: fishing in dark water with superstitiously tied lures; only works well in carefully stocked lakes <br />- we don&#39;t ask questions by defining answer shapes and then hoping they&#39;re dredged up whole.</p>

<h4>Response:</h4>
<p>
<a href="http://dbpedia.org/resource/SPARQL" id="link-id16e45e50">SPARQL</a>, <a href="http://www.freebase.com/view/freebase/api" id="link-id13e7d468">MQL</a>, and <a href="http://msdn.microsoft.com/en-us/library/bb387145.aspx" id="link-id1516fbd8">Entity-SQL</a> are Graph Model oriented Query Languages. Query Languages always accompany Database Engines. <a href="http://dbpedia.org/resource/SQL" id="link-id13f8c100">SQL</a> is the Relational Model equivalent. </p>

<h3>Issue - <a href="http://dbpedia.org/resource/Linked_Data" id="link-id171dee68">Linked Data</a>
</h3>
<p>Noble attempt to ground the abstract, but: <br />- <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id1576d5f8">URI</a> dereferencing/namespace/<a href="http://dbpedia.org/resource/Open_world_assumption" id="link-id15f50180">open-world</a> issues focus too much technical attention on cross-source cases where the human issues dwarf the technical ones anyway <br />- <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id105df458">FOAF</a> query over the people in this room? forget it. <br />- link asymmetry doesn&#39;t scale <br />- identity doesn&#39;t scale <br />- generating RDF from non-graph sources: more appeasement, right where the win from actually converting could be biggest! </p>

<h4>Response:</h4>
<p>Innovative use of HTTP to deliver &quot;<a href="http://dbpedia.org/resource/Reference_%28computer_science%29" id="link-id13eeab20">Data Access by Reference</a>&quot; to the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id13492610">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id105dfc10">Web</a>.</p>
<p>When you have a Data Model, Database Engine, and Query Language, the next thing you need is a Data Access mechanism that provides &quot;<a href="http://dbpedia.org/resource/Reference_(computer_science)" id="link-id100ef2c0">Data Access by Reference</a>&quot;. <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id16692e88">ODBC</a> and <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id1699b970">JDBC</a> (amongst others) provide &quot;<a href="http://dbpedia.org/resource/Reference_(computer_science)" id="link-id16034b48">Data Access by Reference</a>&quot; via Data Source Names. <a href="http://dbpedia.org/resource/Linked_Data" id="link-id16690118">Linked Data</a> is about the same thing (URIs are Data Source Names) with the following differences:</p>
<ul>
<li>Naming is scoped to the <a href="http://dbpedia.org/resource/Entity" id="link-id1195dc48">entity</a> level rather than container level</li>
<li>HTTP&#39;s use within the data source naming scheme expands the referencability of the Named <a href="http://dbpedia.org/resource/Entity" id="link-id10485760">Entity</a> Descriptions beyond traditional confines such as applications, operating systems, and database engines.  </li>
</ul>

<h3>
Issue - <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id104684d0">Giant Global Graph</a>
</h3>

<p>Hugely motivating and powerful idea, worthy of a superhero (Graphius!), but: <br />- giant and global parts are too hard, and starting global makes every problem harder <br />- local projects become unmanageable in global <a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id12497088">context</a> (Cyc, Freebase data-modeling lists...).

And my thus my plea, again. Forget &quot;semantic&quot; and &quot;<a href="http://dbpedia.org/resource/World_Wide_Web">web</a>&quot;, let&#39;s fix the database tech first: <br />- node/arc data-model, path-based exploratory query-model <br />- data-graph applications built easily on top of this common model; building them has to be easy, because if it&#39;s hard, they&#39;ll be bad <br />- given good database tech, good web data-publishing tech will be trivial! <br />- given good tools for graphs, the problems of uniting them will be only as hard as they have to be.</p>

<h4>Response:</h4>
<p>
<a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id144466d8">Giant Global Graph</a> is just another moniker for a &quot;Web of <a href="http://dbpedia.org/resource/Linked_Data" id="link-id15c2c738">Linked Data</a>&quot; or &quot;<a href="http://dbpedia.org/resource/Linked_Data" id="link-id14e73520">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id10aef200">Web</a>&quot;.</p>

<p>Multi-Model Database technology that meshes the best of the Graph &amp; Relational Models exists. In a nutshell, this is what <a href="http://virtuoso.openlinksw.com" id="link-id13492e10">Virtuoso</a> is all about and it&#39;s existed for a very long time :-)</p>

<p>
<a href="http://virtuoso.openlinksw.com" id="link-id105a4f58">Virtuoso</a> is also a Virtual DBMS engine (so you can see Heterogeneous Relational Data via Graph Model <a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id15845110">Context</a> Lenses). Naturally, it is also a <a href="http://dbpedia.org/resource/Linked_Data" id="link-id109e2c78">Linked Data</a> Deployment platform (or <a href="http://dbpedia.org/resource/Linked_Data" id="link-id1086d650">Linked Data</a> Server). </p>
<p>The issue isn&#39;t the &quot;<a href="http://dbpedia.org/resource/Semantic_Web" id="link-id107f1ba8">Semantic Web</a>&quot; moniker per se., it&#39;s about how <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0xba72818">Linked Data</a> (foundation layer of <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id101dbf50">Semantic Web</a>) gets introduced to users. As I said during the MIT Gathering: &quot;The Web is experienced via Web Browsers primarily, so any enhancement to the Web must be exposed via traditional Web Browsers&quot;, which is why we&#39;ve opted to simply add &quot;View <a href="http://dbpedia.org/resource/Linked_Data">Linked Data</a> Sources&quot; to the existing set of common Browser options that includes:</p>
<ol>
<li>View page in rendered form (default)</li>
<li>View page source (i.e., how you see the markup behind the page)</li>
</ol>

<p>By exposing the Linked Data <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id15a04b70">Web</a> option as described above, you enable the Web user to knowingly transition from the traditional Rendered (X)HTML page view to the Linked Data View (i.e., structured data behind the page). This simple &quot;User Interaction&quot; tweak makes the notion of exploiting a Structured Web somewhat clearer.</p>

<p>The Linked Data <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id10a187d0">Web</a> isn&#39;t a panacea. It&#39;s just an addition to the existing Web that enriches the things you can do with the Web. Its predominance, like any application feature, will be subject to the degrees to which it delivers tangible value or materializes internal and external opportunity costs.</p>
<p>Note: The Web isn&#39;t ubiquitous today because all its users grokked HTML Markup. Its ubiquity is a function of opportunity costs: there simply came a point in the Web bootstrap when nobody could afford the opportunity costs associated with being off the Web.  The same thing will play out with Linked Data and the broader <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id10a97330">Semantic Web</a> vision.</p>

<b>Links:</b>

<ol>
<li>
  <a href="http://virtuoso.openlinksw.com/presentations/Creating_Deploying_Exploiting_Linked_Data2/Creating_Deploying_Exploiting_Linked_Data2_TimBL_v3.html(15)" id="link-id137fc560">Linked Data Journey part of my Linked Data Planet Presentation Remix</a>(from slides 15 to 22 - which include bits from <a href="http://www.w3.org/People/Berners-Lee/card#i" id="link-id1048a968">TimBL</a>&#39;s presentation)</li>
<li>
  <a href="http://ode.openlinksw.com" id="link-id1667df98">OpenLink Data Explorer</a>
</li>
<li>
  <a href="http://ode.openlinksw.com/example.html" id="link-id137ee860">OpenLink Data Explorer Screenshots and examples</a>.</li>
</ol>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-06-09#1381">
  <rss:title>The DARQ Matter of Federation</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-06-09T14:02:19Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">The DARQ Matter of Federation Astronomers propose that the universe is held together, so to speak, by the gravity of invisible &quot;dark matter&quot; spread in interstellar and intergalactic space. For the data web, it will be held together by federation, also an invisible factor. As in Minkowski space, so in cyberspace. To take the astronomical analogy further, putting too much visible stuff in one place makes a black hole, whose chief properties are that it is very heavy, can only get heavier and that nothing comes out. DARQ is Bastian Quilitz&#39;s federated extension of the Jena ARQ SPARQL processor. It has existed for a while and was also presented at ESWC2008. There is also SPARQL FED from Andy Seaborne, an explicit means of specifying which end point will process which fragment of a distributed SPARQL query. Still, for federation to deliver in an open, decentralized world, it must be transparent. For a specific application, with a predictable workload, it is of course OK to partition queries explicitly. Bastian had split DBpedia among five Virtuoso servers and was querying this set with DARQ. The end result was that there was a rather frightful cost of federation as opposed to all the data residing in a single Virtuoso. The other result was that if selectivity of predicates was not correctly guessed by the federation engine, the proposition was a non-starter. With correct join order it worked, though. Yet, we really want federation. Looking further down the road, we simply must make federation work. This is just as necessary as running on a server cluster for mid-size workloads. Since we are convinced of the cause, let&#39;s talk about the means. For DARQ as it now stands, there&#39;s probably an order of magnitude or even more to gain from a couple of simple tricks. 
If going to a SPARQL end point that is not the outermost in the loop join sequence, batch the requests together in one HTTP/1.1 message. So, if the query is &quot;get me my friends living in cities of over a million people,&quot; there will be the fragment &quot;get city where x lives&quot; and later &quot;ask if population of x greater than 1000000&quot;. If I have 100 friends, I send the 100 requests in a batch to each eligible server. Further, if running against a server of known brand, use a client-server connection and prepared statements with array parameters. This can well improve the processing speed at the remote end point by another order of magnitude. This gain may however not be as great as the latency savings from message batching. We will provide a sample of how to do this with Virtuoso over JDBC so Bastian can try this if interested. These simple things will give a lot of mileage and may even decide whether federation is an option in specific applications. For the open web however, these measures will not yet win the day. When federating SQL, colocation of data is sort of explicit. If two tables are joined and they are in the same source, then the join can go to the source. For SPARQL this is also so but with a twist: If a foaf:Person is found on a given server, this does not mean that the Person&#39;s geek code or email hash will be on the same server. Thus {?p name &quot;Johnny&quot; . ?p geekCode ?g . ?p emailHash ?h } does not necessarily denote a colocated join if many servers serve items of the vocabulary. However, in most practical cases, for obtaining a rapid answer, treating this as a colocated fragment will be appropriate. Thus, it may be necessary to be able to declare that geek codes will be assumed colocated with names. This will save a lot of message passing and offer decent, if not theoretically total recall. For search style applications, starting with such assumptions will make sense. 
If nothing is found, then we can partition each join step separately for the unlikely case that there were a server that gave geek codes but not names. For Virtuoso, we find that a federated query&#39;s asynchronous, parallel evaluation model is not so different from that on a local cluster. So the cluster version could have the option of federated query. The difference is that a cluster is local and tightly coupled and predictably partitioned but a federated setting is none of these. For description, we would take DARQ&#39;s description model and maybe extend it a little where needed. Also we would enhance the protocol to allow just asking for the query cost estimate given a query with literals specified. We will do this eventually. We would like to talk to Bastian about large improvements to DARQ, specially when working with Virtuoso. We&#39;ll see. Of course, one mode of federating is the crawl-as-you-go approach of the Virtuoso Sponger. This will bring in fragments following seeAlso or sameAs declarations or other references. This will however not have the recall of a warehouse or federation over well described SPARQL end-points. But up to a certain volume it has the speed of local storage. The emergence of voiD (Vocabulary of Interlinked Data) is a step in the direction of making federation a reality. There is a separate post about this.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<div>
<div style="display:none;">The DARQ Matter of Federation</div>
<p>Astronomers propose that the universe is held together, so to speak, by the gravity of invisible &quot;dark matter&quot; spread in interstellar and intergalactic space.</p>
<p>For the <a href="http://dbpedia.org/resource/Data" id="link-id0x19dbf410">data</a> web, it will be held together by federation, also an invisible factor. As in Minkowski space, so in <a href="http://dbpedia.org/resource/Cyberspace" id="link-id0x9fc13ff8">cyberspace</a>.</p>
<p>To take the astronomical analogy further, putting too much visible stuff in one place makes a black hole, whose chief properties are that it is very heavy, can only get heavier and that nothing comes out.</p>
<p>
  <a href="http://darq.sourceforge.net/" id="link-id0x1d06bd88">DARQ</a> is Bastian Quilitz&#39;s federated extension of the <a href="http://jena.sourceforge.net/" id="link-id0x1cf28f70">Jena</a> <a href="http://jena.sourceforge.net/ARQ/" id="link-id0x1cba22c8">ARQ</a> <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x171c7dc8">SPARQL</a> processor. It has existed for a while and was also presented at <a href="http://www.eswc2008.org/" id="link-id0x1ed53cd0">ESWC2008</a>. There is also SPARQL FED from Andy Seaborne, an explicit means of specifying which end point will process which fragment of a distributed SPARQL query. Still, for federation to deliver in an open, decentralized world, it must be transparent. For a specific application, with a predictable workload, it is of course OK to partition queries explicitly.</p>
<p>Bastian had split <a href="http://dbpedia.org/resource/DBpedia" id="link-id0x1ce846c0">DBpedia</a> among five <a href="http://virtuoso.openlinksw.com" id="link-id0x1cad0640">Virtuoso</a> servers and was querying this set with DARQ. The end result was that there was a rather frightful cost of federation as opposed to all the data residing in a single Virtuoso. The other result was that if selectivity of predicates was not correctly guessed by the federation engine, the proposition was a non-starter. With correct join order it worked, though.</p>
<p>Yet, we really want federation. Looking further down the road, we simply must make federation work. This is just as necessary as running on a server cluster for mid-size workloads.</p>
<p>Since we are convinced of the cause, let&#39;s talk about the means.</p>
<p>For DARQ as it now stands, there&#39;s probably an order of magnitude or even more to gain from a couple of simple tricks. If going to a SPARQL end point that is not the outermost in the loop join sequence, batch the requests together in one <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0x19a48280">HTTP</a>/1.1 message. So, if the query is &quot;get me my friends living in cities of over a million people,&quot; there will be the fragment &quot;get city where x lives&quot; and later &quot;ask if population of x greater than 1000000&quot;. If I have 100 friends, I send the 100 requests in a batch to each eligible server.</p>
<p>Further, if running against a server of known brand, use a client-server connection and prepared statements with array parameters. This can well improve the processing speed at the remote end point by another order of magnitude. This gain may however not be as great as the latency savings from message batching. We will provide a sample of how to do this with Virtuoso over <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id0x1cf18278">JDBC</a> so Bastian can try this if interested.</p>
<p>These simple things will give a lot of mileage and may even decide whether federation is an option in specific applications. For the open web however, these measures will not yet win the day.</p>
<p>When federating <a href="http://dbpedia.org/resource/SQL" id="link-id0x1cf7d0e8">SQL</a>, colocation of data is sort of explicit. If two tables are joined and they are in the same source, then the join can go to the source. For SPARQL this is also so but with a twist:</p>
<p>If a foaf:Person is found on a given server, this does not mean that the Person&#39;s geek code or email hash will be on the same server. Thus <code>{?p name &quot;Johnny&quot; . ?p geekCode ?g . ?p emailHash ?h }</code> does not necessarily denote a colocated join if many servers serve items of the vocabulary.</p>
<p>However, in most practical cases, for obtaining a rapid answer, treating this as a colocated fragment will be appropriate. Thus, it may be necessary to be able to declare that geek codes will be assumed colocated with names. This will save a lot of message passing and offer decent, if not theoretically total recall. For search style applications, starting with such assumptions will make sense. If nothing is found, then we can partition each join step separately for the unlikely case that there were a server that gave geek codes but not names.</p>
<p>For Virtuoso, we find that a federated query&#39;s asynchronous, parallel evaluation model is not so different from that on a local cluster. So the cluster version could have the option of federated query. The difference is that a cluster is local and tightly coupled and predictably partitioned but a federated setting is none of these.</p>
<p>For description, we would take DARQ&#39;s description model and maybe extend it a little where needed. Also we would enhance the protocol to allow just asking for the query cost estimate given a query with literals specified. We will do this eventually.</p>
<p>We would like to talk to Bastian about large improvements to DARQ, specially when working with Virtuoso. We&#39;ll see.</p>
<p>Of course, one mode of federating is the crawl-as-you-go approach of the Virtuoso <a href="http://virtuoso.openlinksw.com/Whitepapers/html/VirtSpongerWhitePaper.html" id="link-id0x1e163140">Sponger</a>. This will bring in fragments following seeAlso or sameAs declarations or other references. This will however not have the recall of a warehouse or federation over well described SPARQL end-points. But up to a certain volume it has the speed of local storage.</p>
<p>The emergence of voiD (Vocabulary of Interlinked Data) is a step in the direction of making federation a reality. There is <a href="http://www.openlinksw.com/weblog/oerling/?id=1377" id="link-id1109a4c8">a separate post</a> about this.</p>
</div>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-06-09#1376">
  <rss:title>The DARQ Matter of Federation</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-06-09T13:57:30Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Astronomers propose that the universe is held together, so to speak, by the gravity of invisible &quot;dark matter&quot; spread in interstellar and intergalactic space. For the data web, it will be held together by federation, also an invisible factor. As in Minkowski space, so in cyberspace. To take the astronomical analogy further, putting too much visible stuff in one place makes a black hole, whose chief properties are that it is very heavy, can only get heavier and that nothing comes out. DARQ is Bastian Quilitz&#39;s federated extension of the Jena ARQ SPARQL processor. It has existed for a while and was also presented at ESWC2008. There is also SPARQL FED from Andy Seaborne, an explicit means of specifying which end point will process which fragment of a distributed SPARQL query. Still, for federation to deliver in an open, decentralized world, it must be transparent. For a specific application, with a predictable workload, it is of course OK to partition queries explicitly. Bastian had split DBpedia among five Virtuoso servers and was querying this set with DARQ. The end result was that there was a rather frightful cost of federation as opposed to all the data residing in a single Virtuoso. The other result was that if selectivity of predicates was not correctly guessed by the federation engine, the proposition was a non-starter. With correct join order it worked, though. Yet, we really want federation. Looking further down the road, we simply must make federation work. This is just as necessary as running on a server cluster for mid-size workloads. Since we are convinced of the cause, let&#39;s talk about the means. For DARQ as it now stands, there&#39;s probably an order of magnitude or even more to gain from a couple of simple tricks. If going to a SPARQL end point that is not the outermost in the loop join sequence, batch the requests together in one HTTP/1.1 message. 
So, if the query is &quot;get me my friends living in cities of over a million people,&quot; there will be the fragment &quot;get city where x lives&quot; and later &quot;ask if population of x greater than 1000000&quot;. If I have 100 friends, I send the 100 requests in a batch to each eligible server. Further, if running against a server of known brand, use a client-server connection and prepared statements with array parameters. This can well improve the processing speed at the remote end point by another order of magnitude. This gain may however not be as great as the latency savings from message batching. We will provide a sample of how to do this with Virtuoso over JDBC so Bastian can try this if interested. These simple things will give a lot of mileage and may even decide whether federation is an option in specific applications. For the open web however, these measures will not yet win the day. When federating SQL, colocation of data is sort of explicit. If two tables are joined and they are in the same source, then the join can go to the source. For SPARQL this is also so but with a twist: If a foaf:Person is found on a given server, this does not mean that the Person&#39;s geek code or email hash will be on the same server. Thus {?p name &quot;Johnny&quot; . ?p geekCode ?g . ?p emailHash ?h } does not necessarily denote a colocated join if many servers serve items of the vocabulary. However, in most practical cases, for obtaining a rapid answer, treating this as a colocated fragment will be appropriate. Thus, it may be necessary to be able to declare that geek codes will be assumed colocated with names. This will save a lot of message passing and offer decent, if not theoretically total recall. For search style applications, starting with such assumptions will make sense. If nothing is found, then we can partition each join step separately for the unlikely case that there were a server that gave geek codes but not names. 
For Virtuoso, we find that a federated query&#39;s asynchronous, parallel evaluation model is not so different from that on a local cluster. So the cluster version could have the option of federated query. The difference is that a cluster is local and tightly coupled and predictably partitioned but a federated setting is none of these. For description, we would take DARQ&#39;s description model and maybe extend it a little where needed. Also we would enhance the protocol to allow just asking for the query cost estimate given a query with literals specified. We will do this eventually. We would like to talk to Bastian about large improvements to DARQ, specially when working with Virtuoso. We&#39;ll see. Of course, one mode of federating is the crawl-as-you-go approach of the Virtuoso Sponger. This will bring in fragments following seeAlso or sameAs declarations or other references. This will however not have the recall of a warehouse or federation over well described SPARQL end-points. But up to a certain volume it has the speed of local storage. The emergence of voiD (Vocabulary of Interlinked Data) is a step in the direction of making federation a reality. There is a separate post about this.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Astronomers propose that the universe is held together, so to speak, by the gravity of invisible &quot;dark matter&quot; spread in interstellar and intergalactic space.</p>
<p>For the <a href="http://dbpedia.org/resource/Data" id="link-id0x19bbd830">data</a> web, it will be held together by federation, also an invisible factor. As in Minkowski space, so in <a href="http://dbpedia.org/resource/Cyberspace" id="link-id0x19af2488">cyberspace</a>.</p>
<p>To take the astronomical analogy further, putting too much visible stuff in one place makes a black hole, whose chief properties are that it is very heavy, can only get heavier and that nothing comes out.</p>
<p>
<a href="http://darq.sourceforge.net/" id="link-id0x19b7a9c8">DARQ</a> is Bastian Quilitz&#39;s federated extension of the <a href="http://jena.sourceforge.net/" id="link-id0x19ce3da0">Jena</a> <a href="http://jena.sourceforge.net/ARQ/" id="link-id0xa569a258">ARQ</a> <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x1a8d2270">SPARQL</a> processor. It has existed for a while and was also presented at <a href="http://www.eswc2008.org/" id="link-id0x1aad1d00">ESWC2008</a>. There is also SPARQL FED from Andy Seaborne, an explicit means of specifying which end point will process which fragment of a distributed SPARQL query. Still, for federation to deliver in an open, decentralized world, it must be transparent. For a specific application, with a predictable workload, it is of course OK to partition queries explicitly.</p>
<p>Bastian had split <a href="http://dbpedia.org/resource/DBpedia" id="link-id0x1a8ac770">DBpedia</a> among five <a href="http://virtuoso.openlinksw.com" id="link-id0x19601d30">Virtuoso</a> servers and was querying this set with DARQ. The end result was that there was a rather frightful cost of federation as opposed to all the data residing in a single Virtuoso. The other result was that if selectivity of predicates was not correctly guessed by the federation engine, the proposition was a non-starter. With correct join order it worked, though.</p>
<p>Yet, we really want federation. Looking further down the road, we simply must make federation work. This is just as necessary as running on a server cluster for mid-size workloads.</p>
<p>Since we are convinced of the cause, let&#39;s talk about the means.</p>
<p>For DARQ as it now stands, there&#39;s probably an order of magnitude or even more to gain from a couple of simple tricks. If going to a SPARQL end point that is not the outermost in the loop join sequence, batch the requests together in one <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0x19b94818">HTTP</a>/1.1 message. So, if the query is &quot;get me my friends living in cities of over a million people,&quot; there will be the fragment &quot;get city where x lives&quot; and later &quot;ask if population of x greater than 1000000&quot;. If I have 100 friends, I send the 100 requests in a batch to each eligible server.</p>
<p>Further, if running against a server of known brand, use a client-server connection and prepared statements with array parameters. This can well improve the processing speed at the remote end point by another order of magnitude. This gain may however not be as great as the latency savings from message batching. We will provide a sample of how to do this with Virtuoso over <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id0x17822258">JDBC</a> so Bastian can try this if interested.</p>
<p>These simple things will give a lot of mileage and may even decide whether federation is an option in specific applications. For the open web however, these measures will not yet win the day.</p>
<p>When federating <a href="http://dbpedia.org/resource/SQL" id="link-id0x1a651628">SQL</a>, colocation of data is sort of explicit. If two tables are joined and they are in the same source, then the join can go to the source. For SPARQL this is also so but with a twist:</p>
<p>If a foaf:Person is found on a given server, this does not mean that the Person&#39;s geek code or email hash will be on the same server. Thus <code>{?p name &quot;Johnny&quot; . ?p geekCode ?g . ?p emailHash ?h }</code> does not necessarily denote a colocated join if many servers serve items of the vocabulary.</p>
<p>However, in most practical cases, for obtaining a rapid answer, treating this as a colocated fragment will be appropriate. Thus, it may be necessary to be able to declare that geek codes will be assumed colocated with names. This will save a lot of message passing and offer decent, if not theoretically total recall. For search style applications, starting with such assumptions will make sense. If nothing is found, then we can partition each join step separately for the unlikely case that there were a server that gave geek codes but not names.</p>
<p>For Virtuoso, we find that a federated query&#39;s asynchronous, parallel evaluation model is not so different from that on a local cluster. So the cluster version could have the option of federated query. The difference is that a cluster is local and tightly coupled and predictably partitioned but a federated setting is none of these.</p>
<p>For description, we would take DARQ&#39;s description model and maybe extend it a little where needed. Also we would enhance the protocol to allow just asking for the query cost estimate given a query with literals specified. We will do this eventually.</p>
<p>We would like to talk to Bastian about large improvements to DARQ, specially when working with Virtuoso. We&#39;ll see.</p>
<p>Of course, one mode of federating is the crawl-as-you-go approach of the Virtuoso <a href="http://virtuoso.openlinksw.com/Whitepapers/html/VirtSpongerWhitePaper.html" id="link-id0x1dddce48">Sponger</a>. This will bring in fragments following seeAlso or sameAs declarations or other references. This will however not have the recall of a warehouse or federation over well described SPARQL end-points. But up to a certain volume it has the speed of local storage.</p>
<p>The emergence of voiD (Vocabulary of Interlinked Data) is a step in the direction of making federation a reality. There is <a href="http://www.openlinksw.com/weblog/oerling/?id=1377" id="link-id1109a4c8">a separate post</a> about this.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-05-20#1364">
  <rss:title>ODBC &amp; WODBC Comparison</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-05-20T19:37:53Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">ODBC delivers open data access (by reference) to a broad range of enterprise databases via a &#39;C&#39; based API. Thanks to the iODBC and unixODBC projects, ODBC is available across a broad range of platforms beyond Windows. ODBC identifies data sources using Data Source Names (DSNs). WODBC (Web Open Database Connectivity) delivers open data access to Web Databases / Data Spaces. The Data Source Naming scheme: URI or IRI, is HTTP based thereby enabling data access by reference via the Web. ODBC DSNs bind ODBC client applications to Tables, Views, Stored Procedures. WODBC DSNs bind you to a Data Space (e.g. my FOAF based Profile Page where you can use the &quot;Explore Data Tab&quot; to look around if you are a human visitor) or a specific Entity within a Data Space (i.e., Person Entity Me). ODBC Drivers are built using APIs (DBMS Call Level Interfaces) provided by DBMS vendors. Thus, a DBMS vendor can choose not to release an API, or do so selectively, for competitive advantage or market disruption purposes (it&#39;s happened!). WODBC Drivers are also built using APIs (Web Services associated with a Web Data Space). These drivers are also referred to as RDF Middleware or RDFizers. The &quot;Web&quot; component of WODBC ensures openness, you publish Data with URIs from your Linked Data Server and that&#39;s it; your data space or specific data entities are live and accessible (by reference) over the Web! So we have come full circle (or cycle), the Web is becoming more of a structured database everyday! What&#39;s new is old, and what&#39;s old is new! Data Access is everything, without &quot;Data&quot; there is no information or knowledge. Without &quot;Data&quot; there&#39;s no notion of vitality, purpose, or value. URIs make or break everything in the Linked Data Web just as ODBC DSNs do within the enterprise. 
I&#39;ve deliberately left JDBC, ADO.NET, and OLE-DB out of this piece due to their respective programming languages and frameworks specificity. None of these mechanisms match the platform availability breadth of ODBC. The Web as a true M-V-C pattern is now crystalizing. The &quot;M&quot; (Model) component of M-V-C is finally rising to the realm of broad attention courtesy of the &quot;Linked Data&quot; meme and &quot;Semantic Web&quot; vision. By the way, M-V-C lines up nicely with Web 1.0 (Web Forms / Pages), Web 2.0 (Web Services based APIs), and Web 3.0 (Data Web, Web of Data, or Linked Data Web) :-)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id100eb550">ODBC</a> delivers open <a href="http://dbpedia.org/resource/Data" id="link-idffd2338">data</a> access (by reference) to a broad range of enterprise databases via  a &#39;<a href="http://dbpedia.org/resource/C_(programming_language)" id="link-id104fd1d8">C</a>&#39; based API. Thanks to the <a href="http://www.iodbc.org" id="link-id104721b0">iODBC</a> and <a href="http://www.unixodbc.org" id="link-id10954990">unixODBC</a> projects, <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id10494670">ODBC</a> is available across a broad range of platforms beyond Windows.</p>

<p>
<a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id0xc900928">ODBC</a> identifies <a href="http://dbpedia.org/resource/Data" id="link-id10f82200">data</a> sources using <a href="http://dbpedia.org/resource/Data" id="link-id0xcaad080">Data</a> Source Names (DSNs). </p>
<p>
WODBC (<a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> Open Database Connectivity) delivers open <a href="http://dbpedia.org/resource/Data">data</a> access to Web Databases / Data Spaces. The Data Source Naming scheme: <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id1009ce40">URI</a> or IRI,  is <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id101fc1b0">HTTP</a> based thereby enabling data access by reference via the Web. </p>

<p><a href="http://dbpedia.org/resource/Open_Database_Connectivity">ODBC</a> DSNs bind ODBC client applications to Tables, Views, Stored Procedures. </p>

<p>WODBC DSNs bind you to a Data <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id10182a88">Space</a> (e.g. my <a href="http://myopenlink.net/dataspace/person/kidehen" id="link-id105a7858">FOAF based Profile Page</a> where you can use the &quot;Explore Data Tab&quot; to look around if you are a human visitor) or a specific <a href="http://dbpedia.org/resource/Entity" id="link-id10bd8578">Entity</a> within a Data <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id10780dc0">Space</a> (i.e., <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-id10848e08">Person Entity Me</a>).</p>

<p>ODBC Drivers are built using APIs (DBMS Call Level Interfaces) provided by DBMS vendors. Thus, a DBMS vendor can choose not to release an API, or do so selectively, for competitive advantage or market disruption purposes (it&#39;s happened!).</p>

<p>WODBC Drivers are also built using APIs (Web Services associated with a Web Data <a href="http://en.wikipedia.org/wiki/Data_Spaces" id="link-id0xcbe6348">Space</a>). These drivers are also referred to as <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=rdf%20middleware&type=text&output=html" id="link-id16564058">RDF Middleware</a> or RDFizers. The &quot;Web&quot; component of WODBC ensures openness, you publish Data with URIs from your <a href="http://dbpedia.org/resource/Linked_Data" id="link-id1064a768">Linked Data</a> Server and that&#39;s it; your data <a href="http://en.wikipedia.org/wiki/Data_Spaces">space</a> or specific data entities are live and accessible (by reference) over the Web!</p>

<p>So we have come full circle (or cycle), the Web is becoming more of a structured database everyday! What&#39;s new is old, and what&#39;s old is new! </p>

<p>Data Access is everything, without &quot;Data&quot; there is no <a href="http://dbpedia.org/resource/Information" id="link-id100a9de8">information</a> or <a href="http://dbpedia.org/resource/Knowledge" id="link-id10bb67e8">knowledge</a>. Without &quot;Data&quot; there&#39;s no notion of vitality, purpose, or value.</p>

<p>URIs make or break everything in the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id10a71638">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id10494400">Web</a> just as ODBC DSNs do within the enterprise.
</p>
<p>I&#39;ve deliberately left <a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id10a05280">JDBC</a>, <a href="http://dbpedia.org/resource/ADO.NET" id="link-id104e4a70">ADO</a>.<a href="http://dbpedia.org/resource/ADO.NET" id="link-id10215668">NET</a>, and OLE-DB out of this piece due to their respective programming languages and frameworks specificity. None of these mechanisms match the platform availability breadth of ODBC.</p>

<p>The Web as a true <a href="http://dbpedia.org/resource/Model-view-controller" id="link-id108ee598">M</a>-<a href="http://dbpedia.org/resource/Model-view-controller" id="link-id0xcda5e90">V</a>-C pattern is now crystalizing. The &quot;M&quot; (Model) component of M-V-C is finally rising to the realm of broad attention courtesy of the &quot;<a href="http://www.w3.org/DesignIssues/LinkedData.html" id="link-id1024ff08">Linked Data&quot; meme</a> and &quot;<a href="http://dbpedia.org/resource/Semantic_Web" id="link-id1831b418">Semantic Web</a>&quot; vision.</p>

<p>By the way, M-V-C lines up nicely with Web 1.0 (Web Forms / Pages), Web 2.0 (Web Services based APIs), and Web 3.0 (Data Web, Web of Data, or <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0xb6d0e90">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id0xb22a158">Web</a>) :-)</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-05-16#1362">
  <rss:title>Commercializing the Semantic Web</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-05-16T20:02:45Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Unfortunately, I could only spend 4 days at the recent WWW2008 event in Beijing (I departed the morning following the Linked Data Workshop), so I couldn&#39;t take my slot on the &quot;Commercializing the Semantic Web panel&quot; etc.. Anyway, thanks to the Web I can still inject my points of view in the broad Web based discourse. Well so I hoped, when I attempted to post a comment to Paul Miller&#39;s ZDNet domain hosted blog thread titled: Commercialising the Semantic Web. Unfortunately, the cost of completing ZDNet&#39;s unwieldy signup process simply exceeded the benefits of dropping my comments in their particular space :-( Thus, I&#39;ll settle for a trackback ping instead. What follows is the cut and paste of my intended comment contributions to Paul&#39;s post. Paul, As discussed earlier this week during our podcast session, commercialization of Semantic Web technology shouldn&#39;t be a mercurial matter at this stage in the game :-) It&#39;s all about looking at how it provides value :-) From the Linked Data angle, the ability to produce, dispatch, and exploit &quot;Context&quot; across an array of &quot;Perspectives&quot; from a plethora of disparate data sources on the Web and/or behind corporate firewalls, offers immense commercial value. Yahoo&#39;s Searchmonkey effort will certainly bring clarity to some of the points I made during the podcast re. the role of URIs as &quot;value consumption tickets&quot; (Data Services are exposed via URIs). There has to be a trigger (in user space) that compels Web users to seek broader, or simply varied, perspectives as a response to data encountered on the Web. Yahoo! is about to put this light on in a big way (imho). The &quot;self annotating&quot; nature of the Web is what ultimately drives the manifestation of the long awaited Semantic Web. 
I believe I postulated about &quot;Self Annotation &amp; the Semantic Web&quot; in a number of prior posts which, by the way, should be DataRSS compatible right now due to Yahoo&#39;s support of OpenSearch Data Providers (which this Blog Space has been for eons). Today, we have many communities adding structure to the Web (via their respective tools of preference) without explicitly realizing what they are contributing. Every RSS/Atom feed, Tag, Weblog, Shared Bookmark, Wikiword, Microformat, Microformat++ (eRDF or RDFa), GRDDL stylesheet, and RDFizer etc. is a piece of structured data. Finally, the different communities are all finding ways to work together (thank heavens!) and the results are going to be cataclysmic when it all plays out :-) Data, Structure, and Extraction are the keys to the Semantic Life! First you get the Data in a container (information resource), and then you add Structure to the information resource (RSS, Atom, microformats, RDFa, eRDF, SIOC, FOAF, etc.), once you have Structure, RDFization (i.e. transformation to Linked Data) is a cinch thanks to RDF Middleware (as per earlier RDF middleware posts).</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Unfortunately, I could only spend 4 days at the recent <a href="http://www2008.org/" id="link-id196acf60">WWW2008</a> event in <a href="http://dbpedia.org/resource/Beijing" id="link-id1974fe28">Beijing</a> (I departed the morning following the <a href="http://events.linkeddata.org/ldow2008/" id="link-id1863f858">Linked Data Workshop</a>), so I couldn&#39;t take my slot on the &quot;Commercializing the <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id18990f90">Semantic Web</a> panel&quot; etc.. Anyway, thanks to the <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> I can still inject my points of view in the broad Web based discourse. Well so I hoped, when I attempted to post a comment to Paul Miller&#39;s ZDNet domain hosted <a href="http://dbpedia.org/resource/Blog" id="link-id180d6750">blog</a> thread titled: <a href="http://blogs.zdnet.com/semantic-web/?p=132" id="link-id12d206c0">Commercialising the Semantic Web</a>.</p> <p>Unfortunately, the cost of completing ZDNet&#39;s unwieldy signup process simply exceeded the benefits of dropping my comments in their particular space :-( Thus, I&#39;ll settle for a trackback ping instead.</p>
<p>What follows is the cut and paste of my intended comment contributions to Paul&#39;s post.</p> 
<p>Paul,</p>
<p>
As discussed earlier this week during <a href="http://blogs.talis.com/nodalities/2008/05/kingsley-idehen-talks-about-openlink-software-linked-data-and-the-semantic-web.php" id="link-id1332fb48">our podcast session</a>, commercialization of <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id17382338">Semantic Web</a> technology shouldn&#39;t be a mercurial matter at this stage in the game :-) It&#39;s all about looking at how it provides value :-)</p>

<p>From the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id10d4f4a8">Linked Data</a> angle, the ability to produce, dispatch, and exploit &quot;<a href="http://dbpedia.org/resource/Context_%28language_use%29" id="link-id13bed160">Context</a>&quot; across an array of &quot;Perspectives&quot; from a plethora of disparate <a href="http://dbpedia.org/resource/Data" id="link-id1731e5f0">data</a> sources on the Web and/or behind corporate firewalls, offers immense commercial value.</p>

<p>
<a href="http://developer.yahoo.com/searchmonkey/" id="link-id1975d248">Yahoo&#39;s Searchmonkey</a> effort will certainly bring clarity to some of the points I made during the podcast re. the role of URIs as &quot;value consumption tickets&quot; (<a href="http://dbpedia.org/resource/Data" id="link-id173eb7b0">Data</a> Services are exposed via URIs). There has to be a trigger (in user space) that compels Web users to seek broader, or simply varied, perspectives as a response to <a href="http://dbpedia.org/resource/Data" id="link-id0x1c7e7f60">data</a> encountered on the Web. Yahoo! is about to put this light on in a big way (imho).</p>

<p>The &quot;self annotating&quot; nature of the Web is what ultimately drives the manifestation of the long awaited <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0xa18a83e8">Semantic Web</a>. I believe I postulated about <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=self%20annotation&type=text&output=html" id="link-id173d7458">&quot;Self Annotation &amp; the Semantic Web&quot; in a number of prior posts</a> which, by the way, should be <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&type=text&kwds=self%20annotation&amp;OpenSearch" id="link-id10b12208">DataRSS compatible right now</a> due to Yahoo&#39;s support of OpenSearch <a href="http://dbpedia.org/resource/Data">Data</a> Providers (which this <a href="http://dbpedia.org/resource/Blog" id="link-id170b8df8">Blog</a> Space has been for eons).</p>

<p>Today, we have many communities adding structure to the Web (via their respective tools of preference) without explicitly realizing what they are contributing.  Every RSS/Atom feed, <a href="http://dbpedia.org/resource/Tag" id="link-id183d5178">Tag</a>, Weblog, Shared Bookmark, <a href="http://dbpedia.org/resource/WikiWord" id="link-id10c5e758">Wikiword</a>, Microformat, Microformat++ (<a href="http://dbpedia.org/resource/Embedded_RDF" id="link-id16d8ee40">eRDF</a> or <a href="http://dbpedia.org/resource/RDFa" id="link-id1059a688">RDFa</a>),  <a href="http://dbpedia.org/resource/GRDDL" id="link-id1090ae10">GRDDL</a> stylesheet, and RDFizer etc. is a piece of structured data.</p>

<p>Finally, the different communities are all finding ways to work together (thank heavens!) and the results are going to be cataclysmic when it all plays out :-)</p>

<p>Data, Structure, and Extraction are the keys to the Semantic Life! First you get the Data in a container (<a href="http://dbpedia.org/resource/Information" id="link-id180e5648">information</a> resource), and then you add Structure to the <a href="http://dbpedia.org/resource/Information" id="link-id103801e0">information</a> resource (RSS, Atom, <a href="http://dbpedia.org/resource/Microformats" id="link-id17825e40">microformats</a>, <a href="http://dbpedia.org/resource/RDFa" id="link-id189a8738">RDFa</a>, <a href="http://dbpedia.org/resource/Embedded_RDF" id="link-id1933d5c0">eRDF</a>, SIOC, FOAF,  etc.), once you have Structure, RDFization (i.e. transformation to <a href="http://dbpedia.org/resource/Linked_Data" id="link-id19744878">Linked Data</a>) is a cinch thanks to <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id180dde30">RDF</a> Middleware (as per <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&type=text&kwds=self%20annotation&amp;OpenSearch" id="link-id16dc3130">earlier RDF middleware posts</a>).</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2008-05-09#1359">
  <rss:title>DBpedia Benchmark Revisited</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-05-09T19:33:42Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">DBpedia Benchmark Revisited We ran the DBpedia benchmark queries again with different configurations of Virtuoso. I had not studied the details of the matter previously but now did have a closer look at the queries. Comparing numbers given by different parties is a constant problem. In the case reported here, we loaded the full DBpedia 3, all languages, with about 198M triples, onto Virtuoso v5 and Virtuoso Cluster v6, all on the same 4 core 2GHz Xeon with 8G RAM. All databases were striped on 6 disks. The Cluster configuration was with 4 processes in the same box. We ran the queries in two variants: With graph specified in the SPARQL FROM clause, using the default indices. With no graph specified anywhere, using an alternate indexing scheme. The times below are for the sequence of 5 queries; individual query times are not reported. I did not do a line-by-line review of the execution plans since they seem to run well enough. We could get some extra mileage from cost model tweaks, especially for the numeric range conditions, but we will do this when somebody comes up with better times. First, about Virtuoso v5: Because there is a query in the set that specifies no condition on S or O and only P, this simply cannot be done with the default indices. With Virtuoso Cluster v6 it sort-of can, because v6 is more space efficient. So we added the index: create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s); Virtuoso v5 with gspo, ogps, pogs Virtuoso Cluster v6 with gspo, ogps Virtuoso Cluster v6 with gspo, ogps, pogs cold 210 s 136 s 33.4 s warm 0.600 s 4.01 s 0.628 s OK, so now let us do it without a graph being specified. 
For all platforms, we drop any existing indices, and -- create table r2 (g iri_id_8, s, iri_id_8, p iri_id_8, o any, primary key (s, p, o, g)) alter index R2 on R2 partition (s int (0hexffff00)); log_enable (2); insert into r2 (g, s, p, o) select g, s, p, o from rdf_quad; drop table rdf_quad; alter table r2 rename RDF_QUAD; create bitmap index rdf_quad_opgs on rdf_quad (o, p, g, s) partition (o varchar (-1, 0hexffff)); create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s) partition (o varchar (-1, 0hexffff)); create bitmap index rdf_quad_gpos on rdf_quad (g, p, o, s) partition (o varchar (-1, 0hexffff)); The code is identical for v5 and v6, except that with v5 we use iri_id (32 bit) for the type, not iri_id_8 (64 bit). We note that we run out of IDs with v5 around a few billion triples, so with v6 we have double the ID length and still manage to be vastly more space efficient. With the above 4 indices, we can query the data pretty much in any combination without hitting a full scan of any index. We note that all indices that do not begin with s end with s as a bitmap. This takes about 60% of the space of a non-bitmap index for data such as DBpedia. If you intend to do completely arbitrary RDF queries in Virtuoso, then chances are you are best off with the above index scheme. Â  Virtuoso v5 with gspo, ogps, pogs Virtuoso Cluster v6 with spog, pogs, opgs, gpos warm 0.595 s 0.617 s The cold times were about the same as above, so not reproduced. Graph or No Graph? It is in the SPARQL spirit to specify a graph and for pretty much any application, there are entirely sensible ways of keeping the data in graphs and specifying which ones are concerned by queries. This is why Virtuoso is set up for this by default. On the other hand, for the open web scenario, dealing with an unknown large number of graphs, enumerating graphs is not possible and questions like which graph of which source asserts x become relevant. 
We have two distinct use cases which warrant different setups of the database, simple as that. The latter use case is not really within the SPARQL spec, so implementations may or may not support this. For example Oracle or Vertica would not do this well since they partition data according to graph or predicate, respectively. On the other hand, stores that work with one quad table, which is most of the ones out there, should do it maybe with some configuring, as shown above. Frameworks like Jena are not to my knowledge geared towards having a wildcard for graph, although I would suppose this can be arranged by adding some &quot;super-graph&quot; object, a graph of all graphs. I don&#39;t think this is directly supported and besides most apps would not need it. Once the indices are right, there is no difference between specifying a graph and not specifying a graph with the queries considered. With more complex queries, specifying a graph or set of graphs does allow some optimizations that cannot be done with no graph specified. For example, bitmap intersections are possible only when all leading key parts are given. Conclusions The best warm cache time is with v5; the five queries run under 600 ms after the first go. This is noted to show that all-in-memory with a single thread of execution is hard to beat. Cluster v6 performs the same queries in 623 ms. What is gained in parallelism is lost in latency if all operations complete in microseconds. On the other hand, Cluster v6 leaves v5 in the dust in any situation that has less than 100% hit rate. This is due to actual benefit from parallelism if operations take longer than a few microseconds, such as in the case of disk reads. Cluster v6 has substantially better data layout on disk, as well as fewer pages to load for the same content. This makes it possible to run the queries without the pogs index on Cluster v6 even when v5 takes prohibitively long. 
The moral of the story is to have a lot of RAM and space-efficient data representation. The DBpedia benchmark does not specify any random access pattern that would give a measure of sustained throughput under load, so we are left with the extremes of cold and warm cache of which neither is quite realistic. Chris Bizer and I have talked on and off about benchmarks and I have made suggestions that we will see incorporated into the Berlin SPARQL benchmark, which will, I believe, be much more informative. Appendix: Query Text For reference, the query texts specifying the graph are below. To run without specifying the graph, just drop the FROM &lt;http://dbpedia.org&gt; from each query. The returned row counts are indicated below each query&#39;s text. sparql SELECT ?p ?o FROM &lt;http://dbpedia.org&gt; WHERE { &lt;http://dbpedia.org/resource/Metropolitan_Museum_of_Art&gt; ?p ?o }; -- 1337 rows sparql PREFIX p: &lt;http://dbpedia.org/property/&gt; SELECT ?film1 ?actor1 ?film2 ?actor2 FROM &lt;http://dbpedia.org&gt; WHERE { ?film1 p:starring &lt;http://dbpedia.org/resource/Kevin_Bacon&gt; . ?film1 p:starring ?actor1 . ?film2 p:starring ?actor1 . ?film2 p:starring ?actor2 . }; -- 23910 rows sparql PREFIX p: &lt;http://dbpedia.org/property/&gt; SELECT ?artist ?artwork ?museum ?director FROM &lt;http://dbpedia.org&gt; WHERE { ?artwork p:artist ?artist . ?artwork p:museum ?museum . ?museum p:director ?director }; -- 303 rows sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt; PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt; SELECT ?s ?homepage FROM &lt;http://dbpedia.org&gt; WHERE { &lt;http://dbpedia.org/resource/Berlin&gt; geo:lat ?berlinLat . &lt;http://dbpedia.org/resource/Berlin&gt; geo:long ?berlinLong . ?s geo:lat ?lat . ?s geo:long ?long . ?s foaf:homepage ?homepage . 
FILTER ( ?lat &lt;= ?berlinLat + 0.03190235436 &amp;&amp; ?long &gt;= ?berlinLong - 0.08679199218 &amp;&amp; ?lat &gt;= ?berlinLat - 0.03190235436 &amp;&amp; ?long &lt;= ?berlinLong + 0.08679199218) }; -- 56 rows sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt; PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt; PREFIX p: &lt;http://dbpedia.org/property/&gt; SELECT ?s ?a ?homepage FROM &lt;http://dbpedia.org&gt; WHERE { &lt;http://dbpedia.org/resource/New_York_City&gt; geo:lat ?nyLat . &lt;http://dbpedia.org/resource/New_York_City&gt; geo:long ?nyLong . ?s geo:lat ?lat . ?s geo:long ?long . ?s p:architect ?a . ?a foaf:homepage ?homepage . FILTER ( ?lat &lt;= ?nyLat + 0.3190235436 &amp;&amp; ?long &gt;= ?nyLong - 0.8679199218 &amp;&amp; ?lat &gt;= ?nyLat - 0.3190235436 &amp;&amp; ?long &lt;= ?nyLong + 0.8679199218) }; -- 13 rows</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<div>
<div style="display:none;">DBpedia Benchmark Revisited</div>
<p>We ran the <a href="http://dbpedia.org/resource/DBpedia" id="link-id0x1cd6d0c8">DBpedia</a> benchmark queries again with different
configurations of <a href="http://virtuoso.openlinksw.com" id="link-id0x1bf01048">Virtuoso</a>. I had not studied the details of the
matter previously but now did have a closer look at the
queries.</p>
<p>Comparing numbers given by different parties is a constant
problem. In the case reported here, we loaded the full DBpedia 3,
all languages, with about 198M triples, onto Virtuoso v5 and Virtuoso Cluster v6,
all on the same 4 core 2GHz Xeon with 8G RAM. All databases were
striped on 6 disks. The Cluster configuration was with 4 processes
in the same box.</p>
<p>We ran the queries in two variants:</p> 
<ul>
<li>With graph
specified in the <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x1b9d3ca0">SPARQL</a> <code>FROM</code> clause, using the default indices.</li>
<li>With no graph specified anywhere, using an
alternate indexing scheme.</li>
</ul>
<p>The times below are for the sequence of 5 queries; individual
query times are not reported. I did not do a line-by-line review of
the execution plans since they seem to run well enough. We could
get some extra mileage from cost model tweaks, especially for the
numeric range conditions, but we will do this when somebody comes up
with better times.</p>
<p>First, about Virtuoso v5: Because there is a query in the set that
specifies no condition on S or O and only P, this simply cannot be
done with the default indices. With Virtuoso Cluster v6 it sort-of can, because v6 is
more space efficient.</p>
<p>So we added the index:</p>
<blockquote>
<code>
create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s);
</code>
</blockquote>

<table>
 <tr>
  <td>&#160;</td>
  <td align="center"><b>Virtuoso v5 with<br /> gspo, ogps, pogs</b>
  </td>
  <td align="center"><b>Virtuoso Cluster v6 with <br />gspo, ogps</b>
  </td>
  <td align="center"><b>Virtuoso Cluster v6 with <br />gspo, ogps, pogs</b>
  </td>
 </tr>
<tr>
  <td><b>cold</b>
  </td>
  <td align="center">210 s</td>
  <td align="center">136 s</td>
  <td align="center">33.4 s</td>
</tr>
<tr>
  <td><b>warm</b>
  </td>
  <td align="center">0.600 s</td>
  <td align="center">4.01 s</td>
  <td align="center">0.628 s</td>
</tr>
</table>

<p>OK, so now let us do it without a graph being specified. For
all platforms, we drop any existing indices, and --</p>
<blockquote>
<code>
create table r2 (g iri_id_8, s iri_id_8, p iri_id_8, o any, primary key (s, p, o, g)); <br />
alter index R2 on R2 partition (s int (0hexffff00)); <br />
 <br />
log_enable (2); <br />
insert into r2 (g, s, p, o) select g, s, p, o from rdf_quad; <br />
 <br />
drop table rdf_quad; <br />
alter table r2 rename RDF_QUAD; <br />
create bitmap index rdf_quad_opgs on rdf_quad (o, p, g, s) partition (o varchar (-1, 0hexffff)); <br />
create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s) partition (o varchar (-1, 0hexffff)); <br />
create bitmap index rdf_quad_gpos on rdf_quad (g, p, o, s) partition (o varchar (-1, 0hexffff));
</code>
</blockquote>
<p>The code is identical for v5 and v6, except that with v5 we use
<code>iri_id (32 bit)</code> for the type, not <code>iri_id_8 (64 bit)</code>. We note that
we run out of IDs with v5 around a few billion triples, so with v6
we have double the ID length and still manage to be vastly more
space efficient.</p>
<p>With the above 4 indices, we can query the <a href="http://dbpedia.org/resource/Data" id="link-id0x1bae4cd8">data</a> pretty much in
any combination without hitting a full scan of any index. We note
that all indices that do not begin with s end with s as a bitmap.
This takes about 60% of the space of a non-bitmap index for data such
as DBpedia.</p>
<p>If you intend to do completely arbitrary RDF queries in
Virtuoso, then chances are you are best off with the above index
scheme.</p>

<table>
 <tr>
  <td>&#160;</td>
  <td align="center"><b> Virtuoso v5 with<br /> gspo, ogps, pogs</b>
  </td>
  <td align="center"><b> Virtuoso Cluster v6 with <br /> spog, pogs, opgs, gpos </b>
  </td>
 </tr>
<tr>
  <td><b>warm</b>
  </td>
  <td align="center">0.595 s</td>
  <td align="center">0.617 s</td>
</tr>
</table>

<p>The cold times were about the same as above, so not
reproduced.</p>
<h3>Graph or No Graph?</h3>
<p>It is in the SPARQL spirit to specify a graph and for pretty
much any application, there are entirely sensible ways of keeping
the data in graphs and specifying which ones are concerned by
queries. This is why Virtuoso is set up for this by default.</p>
<p>On the other hand, for the open web scenario, dealing with an
unknown large number of graphs, enumerating graphs is not possible
and questions like which graph of which source asserts x become
relevant. We have two distinct use cases which warrant different
setups of the database, simple as that.</p>
<p>The latter use case is not really within the SPARQL spec, so
implementations may or may not support this. For example <a href="http://dbpedia.org/resource/Oracle_Database" id="link-id0x1cd2db78">Oracle</a> or
Vertica would not do this well since they partition data according
to graph or predicate, respectively. On the other hand, stores that
work with one quad table, which is most of the ones out there,
should do it maybe with some configuring, as shown above.</p>
<p>Frameworks like Jena are not to my <a href="http://dbpedia.org/resource/Knowledge" id="link-id0x1b300390">knowledge</a> geared towards
having a wildcard for graph, although I would suppose this can be
arranged by adding some &quot;super-graph&quot; object, a graph of all
graphs. I don&#39;t think this is directly supported and besides most
apps would not need it.</p>
<p>Once the indices are right, there is no difference between
specifying a graph and not specifying a graph with the queries considered. With
more complex queries, specifying a graph or set of graphs does
allow some optimizations that cannot be done with no graph specified.
For example, bitmap intersections are possible only when all
leading key parts are given.</p>
<h3>Conclusions</h3>
<p>The best warm cache time is with v5; the five queries run under
600 ms after the first go. This is noted to show that all-in-memory with
a single thread of execution is hard to beat.</p>
<p>Cluster v6 performs the same queries in 623 ms. What is gained in
parallelism is lost in latency if all operations complete in
microseconds. On the other hand, Cluster v6 leaves v5 in the dust in
any situation that has less than 100% hit rate. This is due to
actual benefit from parallelism if operations take longer than a
few microseconds, such as in the case of disk reads. Cluster v6 has
substantially better data layout on disk, as well as fewer pages to
load for the same content.</p>
<p>This makes it possible to run the queries without the pogs
index on Cluster v6 even when v5 takes prohibitively long.</p>
<p>The moral of the story is to have a lot of RAM and space-efficient data representation.</p>
<p>The DBpedia benchmark does not specify any random access
pattern that would give a measure of sustained throughput under
load, so we are left with the extremes of cold and warm cache of
which neither is quite realistic.</p>
<p>Chris Bizer and I have talked on and off about benchmarks and
I have made suggestions that we will see incorporated into the
Berlin SPARQL benchmark, which will, I believe, be much more
informative.</p>
<h3>Appendix: Query Text</h3>
<p>For reference, the query texts specifying the graph are below. To
run without specifying the graph, just drop the <code>FROM
&lt;http://dbpedia.org&gt;</code> from each query. The returned row counts are indicated
below each query&#39;s text.</p>
<blockquote>
 <code><pre>
sparql SELECT ?p ?o FROM &lt;http://dbpedia.org&gt; WHERE {
  &lt;http://dbpedia.org/resource/Metropolitan_Museum_of_Art&gt; ?p ?o };

-- 1337 rows

sparql PREFIX p: &lt;http://dbpedia.org/property/&gt;
SELECT ?film1 ?actor1 ?film2 ?actor2
FROM &lt;http://dbpedia.org&gt; WHERE {
  ?film1 p:starring &lt;http://dbpedia.org/resource/Kevin_Bacon&gt; .
  ?film1 p:starring ?actor1 .
  ?film2 p:starring ?actor1 .
  ?film2 p:starring ?actor2 . };

--  23910 rows

sparql PREFIX p: &lt;http://dbpedia.org/property/&gt;
SELECT ?artist ?artwork ?museum ?director FROM &lt;http://dbpedia.org&gt; 
WHERE {
  ?artwork p:artist ?artist .
  ?artwork p:museum ?museum .
  ?museum p:director ?director };

-- 303 rows

sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt;
PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt;
PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt;
SELECT ?s ?homepage FROM &lt;http://dbpedia.org&gt;  WHERE {
   &lt;http://dbpedia.org/resource/Berlin&gt; geo:lat ?berlinLat .
   &lt;http://dbpedia.org/resource/Berlin&gt; geo:long ?berlinLong . 
   ?s geo:lat ?lat .
   ?s geo:long ?long .
   ?s foaf:homepage ?homepage .
   FILTER (
     ?lat        &lt;=     ?berlinLat + 0.03190235436 &amp;&amp;
     ?long       &gt;=     ?berlinLong - 0.08679199218 &amp;&amp;
     ?lat        &gt;=     ?berlinLat - 0.03190235436 &amp;&amp; 
     ?long       &lt;=     ?berlinLong + 0.08679199218) };

-- 56 rows

sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt;
PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt;
PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt;
PREFIX p: &lt;http://dbpedia.org/property/&gt;
SELECT ?s ?a ?homepage FROM &lt;http://dbpedia.org&gt;  WHERE {
   &lt;http://dbpedia.org/resource/New_York_City&gt; geo:lat ?nyLat .
   &lt;http://dbpedia.org/resource/New_York_City&gt; geo:long ?nyLong . 
   ?s geo:lat ?lat .
   ?s geo:long ?long .
   ?s p:architect ?a .
   ?a foaf:homepage ?homepage .
   FILTER (
     ?lat        &lt;=     ?nyLat + 0.3190235436 &amp;&amp;
     ?long       &gt;=     ?nyLong - 0.8679199218 &amp;&amp;
     ?lat        &gt;=     ?nyLat - 0.3190235436 &amp;&amp; 
     ?long       &lt;=     ?nyLong + 0.8679199218) };

-- 13 rows
</pre>
 </code>
</blockquote>
</div>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2008-05-09#1358">
  <rss:title>DBpedia Benchmark Revisited</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-05-09T19:27:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">We ran the DBpedia benchmark queries again with different configurations of Virtuoso. I had not studied the details of the matter previously but now did have a closer look at the queries. Comparing numbers given by different parties is a constant problem. In the case reported here, we loaded the full DBpedia 3, all languages, with about 198M triples, onto Virtuoso v5 and Virtuoso Cluster v6, all on the same 4 core 2GHz Xeon with 8G RAM. All databases were striped on 6 disks. The Cluster configuration was with 4 processes in the same box. We ran the queries in two variants: With graph specified in the SPARQL FROM clause, using the default indices. With no graph specified anywhere, using an alternate indexing scheme. The times below are for the sequence of 5 queries; individual query times are not reported. I did not do a line-by-line review of the execution plans since they seem to run well enough. We could get some extra mileage from cost model tweaks, especially for the numeric range conditions, but we will do this when somebody comes up with better times. First, about Virtuoso v5: Because there is a query in the set that specifies no condition on S or O and only P, this simply cannot be done with the default indices. With Virtuoso Cluster v6 it sort-of can, because v6 is more space efficient. So we added the index: create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s); Â  Virtuoso v5 with gspo, ogps, pogs Virtuoso Cluster v6 with gspo, ogps Virtuoso Cluster v6 with gspo, ogps, pogs cold 210 s 136 s 33.4 s warm 0.600 s 4.01 s 0.628 s OK, so now let us do it without a graph being specified. 
For all platforms, we drop any existing indices, and -- create table r2 (g iri_id_8, s, iri_id_8, p iri_id_8, o any, primary key (s, p, o, g)) alter index R2 on R2 partition (s int (0hexffff00)); log_enable (2); insert into r2 (g, s, p, o) select g, s, p, o from rdf_quad; drop table rdf_quad; alter table r2 rename RDF_QUAD; create bitmap index rdf_quad_opgs on rdf_quad (o, p, g, s) partition (o varchar (-1, 0hexffff)); create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s) partition (o varchar (-1, 0hexffff)); create bitmap index rdf_quad_gpos on rdf_quad (g, p, o, s) partition (o varchar (-1, 0hexffff)); The code is identical for v5 and v6, except that with v5 we use iri_id (32 bit) for the type, not iri_id_8 (64 bit). We note that we run out of IDs with v5 around a few billion triples, so with v6 we have double the ID length and still manage to be vastly more space efficient. With the above 4 indices, we can query the data pretty much in any combination without hitting a full scan of any index. We note that all indices that do not begin with s end with s as a bitmap. This takes about 60% of the space of a non-bitmap index for data such as DBpedia. If you intend to do completely arbitrary RDF queries in Virtuoso, then chances are you are best off with the above index scheme. Â  Virtuoso v5 with gspo, ogps, pogs Virtuoso Cluster v6 with spog, pogs, opgs, gpos warm 0.595 s 0.617 s The cold times were about the same as above, so not reproduced. Graph or No Graph? It is in the SPARQL spirit to specify a graph and for pretty much any application, there are entirely sensible ways of keeping the data in graphs and specifying which ones are concerned by queries. This is why Virtuoso is set up for this by default. On the other hand, for the open web scenario, dealing with an unknown large number of graphs, enumerating graphs is not possible and questions like which graph of which source asserts x become relevant. 
We have two distinct use cases which warrant different setups of the database, simple as that. The latter use case is not really within the SPARQL spec, so implementations may or may not support this. For example Oracle or Vertica would not do this well since they partition data according to graph or predicate, respectively. On the other hand, stores that work with one quad table, which is most of the ones out there, should do it maybe with some configuring, as shown above. Frameworks like Jena are not to my knowledge geared towards having a wildcard for graph, although I would suppose this can be arranged by adding some &quot;super-graph&quot; object, a graph of all graphs. I don&#39;t think this is directly supported and besides most apps would not need it. Once the indices are right, there is no difference between specifying a graph and not specifying a graph with the queries considered. With more complex queries, specifying a graph or set of graphs does allow some optimizations that cannot be done with no graph specified. For example, bitmap intersections are possible only when all leading key parts are given. Conclusions The best warm cache time is with v5; the five queries run under 600 ms after the first go. This is noted to show that all-in-memory with a single thread of execution is hard to beat. Cluster v6 performs the same queries in 623 ms. What is gained in parallelism is lost in latency if all operations complete in microseconds. On the other hand, Cluster v6 leaves v5 in the dust in any situation that has less than 100% hit rate. This is due to actual benefit from parallelism if operations take longer than a few microseconds, such as in the case of disk reads. Cluster v6 has substantially better data layout on disk, as well as fewer pages to load for the same content. This makes it possible to run the queries without the pogs index on Cluster v6 even when v5 takes prohibitively long. 
The moral of the story is to have a lot of RAM and space-efficient data representation. The DBpedia benchmark does not specify any random access pattern that would give a measure of sustained throughput under load, so we are left with the extremes of cold and warm cache of which neither is quite realistic. Chris Bizer and I have talked on and off about benchmarks and I have made suggestions that we will see incorporated into the Berlin SPARQL benchmark, which will, I believe, be much more informative. Appendix: Query Text For reference, the query texts specifying the graph are below. To run without specifying the graph, just drop the FROM &lt;http://dbpedia.org&gt; from each query. The returned row counts are indicated below each query&#39;s text. sparql SELECT ?p ?o FROM &lt;http://dbpedia.org&gt; WHERE { &lt;http://dbpedia.org/resource/Metropolitan_Museum_of_Art&gt; ?p ?o }; -- 1337 rows sparql PREFIX p: &lt;http://dbpedia.org/property/&gt; SELECT ?film1 ?actor1 ?film2 ?actor2 FROM &lt;http://dbpedia.org&gt; WHERE { ?film1 p:starring &lt;http://dbpedia.org/resource/Kevin_Bacon&gt; . ?film1 p:starring ?actor1 . ?film2 p:starring ?actor1 . ?film2 p:starring ?actor2 . }; -- 23910 rows sparql PREFIX p: &lt;http://dbpedia.org/property/&gt; SELECT ?artist ?artwork ?museum ?director FROM &lt;http://dbpedia.org&gt; WHERE { ?artwork p:artist ?artist . ?artwork p:museum ?museum . ?museum p:director ?director }; -- 303 rows sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt; PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt; SELECT ?s ?homepage FROM &lt;http://dbpedia.org&gt; WHERE { &lt;http://dbpedia.org/resource/Berlin&gt; geo:lat ?berlinLat . &lt;http://dbpedia.org/resource/Berlin&gt; geo:long ?berlinLong . ?s geo:lat ?lat . ?s geo:long ?long . ?s foaf:homepage ?homepage . 
FILTER ( ?lat &lt;= ?berlinLat + 0.03190235436 &amp;&amp; ?long &gt;= ?berlinLong - 0.08679199218 &amp;&amp; ?lat &gt;= ?berlinLat - 0.03190235436 &amp;&amp; ?long &lt;= ?berlinLong + 0.08679199218) }; -- 56 rows sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt; PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt; PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt; PREFIX p: &lt;http://dbpedia.org/property/&gt; SELECT ?s ?a ?homepage FROM &lt;http://dbpedia.org&gt; WHERE { &lt;http://dbpedia.org/resource/New_York_City&gt; geo:lat ?nyLat . &lt;http://dbpedia.org/resource/New_York_City&gt; geo:long ?nyLong . ?s geo:lat ?lat . ?s geo:long ?long . ?s p:architect ?a . ?a foaf:homepage ?homepage . FILTER ( ?lat &lt;= ?nyLat + 0.3190235436 &amp;&amp; ?long &gt;= ?nyLong - 0.8679199218 &amp;&amp; ?lat &gt;= ?nyLat - 0.3190235436 &amp;&amp; ?long &lt;= ?nyLong + 0.8679199218) }; -- 13 rows</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>We ran the <a href="http://dbpedia.org/resource/DBpedia" id="link-id0x1b7f9688">DBpedia</a> benchmark queries again with different
configurations of <a href="http://virtuoso.openlinksw.com" id="link-id0x1cca2e00">Virtuoso</a>. I had not studied the details of the
matter previously but now did have a closer look at the
queries.</p>
<p>Comparing numbers given by different parties is a constant
problem. In the case reported here, we loaded the full DBpedia 3,
all languages, with about 198M triples, onto Virtuoso v5 and Virtuoso Cluster v6,
all on the same 4 core 2GHz Xeon with 8G RAM. All databases were
striped on 6 disks. The Cluster configuration was with 4 processes
in the same box.</p>
<p>We ran the queries in two variants:</p> 
<ul>
<li>With graph
specified in the <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x1b77f758">SPARQL</a> <code>FROM</code> clause, using the default indices.</li>
<li>With no graph specified anywhere, using an
alternate indexing scheme.</li>
</ul>
<p>The times below are for the sequence of 5 queries; individual
query times are not reported. I did not do a line-by-line review of
the execution plans since they seem to run well enough. We could
get some extra mileage from cost model tweaks, especially for the
numeric range conditions, but we will do this when somebody comes up
with better times.</p>
<p>First, about Virtuoso v5: Because there is a query in the set that
specifies no condition on S or O and only P, this simply cannot be
done with the default indices. With Virtuoso Cluster v6 it sort-of can, because v6 is
more space efficient.</p>
<p>So we added the index:</p>
<blockquote>
<code>
create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s);
</code>
</blockquote>

<table>
 <tr>
  <td>&#160;</td>
  <td align="center"><b>Virtuoso v5 with<br /> gspo, ogps, pogs</b>
  </td>
  <td align="center"><b>Virtuoso Cluster v6 with <br />gspo, ogps</b>
  </td>
  <td align="center"><b>Virtuoso Cluster v6 with <br />gspo, ogps, pogs</b>
  </td>
 </tr>
<tr>
  <td><b>cold</b>
  </td>
  <td align="center">210 s</td>
  <td align="center">136 s</td>
  <td align="center">33.4 s</td>
</tr>
<tr>
  <td><b>warm</b>
  </td>
  <td align="center">0.600 s</td>
  <td align="center">4.01 s</td>
  <td align="center">0.628 s</td>
</tr>
</table>

<p>OK, so now let us do it without a graph being specified. For
all platforms, we drop any existing indices, and --</p>
<blockquote>
<code>
create table r2 (g iri_id_8, s iri_id_8, p iri_id_8, o any, primary key (s, p, o, g)); <br />
alter index R2 on R2 partition (s int (0hexffff00)); <br />
 <br />
log_enable (2); <br />
insert into r2 (g, s, p, o) select g, s, p, o from rdf_quad; <br />
 <br />
drop table rdf_quad; <br />
alter table r2 rename RDF_QUAD; <br />
create bitmap index rdf_quad_opgs on rdf_quad (o, p, g, s) partition (o varchar (-1, 0hexffff)); <br />
create bitmap index rdf_quad_pogs on rdf_quad (p, o, g, s) partition (o varchar (-1, 0hexffff)); <br />
create bitmap index rdf_quad_gpos on rdf_quad (g, p, o, s) partition (o varchar (-1, 0hexffff));
</code>
</blockquote>
<p>The code is identical for v5 and v6, except that with v5 we use
<code>iri_id (32 bit)</code> for the type, not <code>iri_id_8 (64 bit)</code>. We note that
we run out of IDs with v5 around a few billion triples, so with v6
we have double the ID length and still manage to be vastly more
space efficient.</p>
<p>With the above 4 indices, we can query the <a href="http://dbpedia.org/resource/Data" id="link-id0x6339b80">data</a> pretty much in
any combination without hitting a full scan of any index. We note
that all indices that do not begin with s end with s as a bitmap.
This takes about 60% of the space of a non-bitmap index for data such
as DBpedia.</p>
<p>If you intend to do completely arbitrary RDF queries in
Virtuoso, then chances are you are best off with the above index
scheme.</p>

<table>
 <tr>
  <td>&#160;</td>
  <td align="center"><b> Virtuoso v5 with<br /> gspo, ogps, pogs</b>
  </td>
  <td align="center"><b> Virtuoso Cluster v6 with <br /> spog, pogs, opgs, gpos </b>
  </td>
 </tr>
<tr>
  <td><b>warm</b>
  </td>
  <td align="center">0.595 s</td>
  <td align="center">0.617 s</td>
</tr>
</table>

<p>The cold times were about the same as above, so not
reproduced.</p>
<h3>Graph or No Graph?</h3>
<p>It is in the SPARQL spirit to specify a graph and for pretty
much any application, there are entirely sensible ways of keeping
the data in graphs and specifying which ones are concerned by
queries. This is why Virtuoso is set up for this by default.</p>
<p>On the other hand, for the open web scenario, dealing with an
unknown large number of graphs, enumerating graphs is not possible
and questions like which graph of which source asserts x become
relevant. We have two distinct use cases which warrant different
setups of the database, simple as that.</p>
<p>The latter use case is not really within the SPARQL spec, so
implementations may or may not support this. For example <a href="http://dbpedia.org/resource/Oracle_Database" id="link-id0x11ed7028">Oracle</a> or
Vertica would not do this well since they partition data according
to graph or predicate, respectively. On the other hand, stores that
work with one quad table, which is most of the ones out there,
should do it maybe with some configuring, as shown above.</p>
<p>Frameworks like Jena are not to my <a href="http://dbpedia.org/resource/Knowledge" id="link-id0x1a49ded0">knowledge</a> geared towards
having a wildcard for graph, although I would suppose this can be
arranged by adding some &quot;super-graph&quot; object, a graph of all
graphs. I don&#39;t think this is directly supported and besides most
apps would not need it.</p>
<p>Once the indices are right, there is no difference between
specifying a graph and not specifying a graph with the queries considered. With
more complex queries, specifying a graph or set of graphs does
allow some optimizations that cannot be done with no graph specified.
For example, bitmap intersections are possible only when all
leading key parts are given.</p>
<h3>Conclusions</h3>
<p>The best warm cache time is with v5; the five queries run under
600 ms after the first go. This is noted to show that all-in-memory with
a single thread of execution is hard to beat.</p>
<p>Cluster v6 performs the same queries in 623 ms. What is gained in
parallelism is lost in latency if all operations complete in
microseconds. On the other hand, Cluster v6 leaves v5 in the dust in
any situation that has less than 100% hit rate. This is due to
actual benefit from parallelism if operations take longer than a
few microseconds, such as in the case of disk reads. Cluster v6 has
substantially better data layout on disk, as well as fewer pages to
load for the same content.</p>
<p>This makes it possible to run the queries without the pogs
index on Cluster v6 even when v5 takes prohibitively long.</p>
<p>The moral of the story is to have a lot of RAM and space-efficient data representation.</p>
<p>The DBpedia benchmark does not specify any random access
pattern that would give a measure of sustained throughput under
load, so we are left with the extremes of cold and warm cache of
which neither is quite realistic.</p>
<p>Chris Bizer and I have talked on and off about benchmarks and
I have made suggestions that we will see incorporated into the
Berlin SPARQL benchmark, which will, I believe, be much more
informative.</p>
<h3>Appendix: Query Text</h3>
<p>For reference, the query texts specifying the graph are below. To
run without specifying the graph, just drop the <code>FROM
&lt;<a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0x1905bfd0">http</a>://dbpedia.org&gt;</code> from each query. The returned row counts are indicated
below each query&#39;s text.</p>
<blockquote>
 <code><pre>
sparql SELECT ?p ?o FROM &lt;http://dbpedia.org&gt; WHERE {
  &lt;http://dbpedia.org/resource/Metropolitan_Museum_of_Art&gt; ?p ?o };

-- 1337 rows

sparql PREFIX p: &lt;http://dbpedia.org/property/&gt;
SELECT ?film1 ?actor1 ?film2 ?actor2
FROM &lt;http://dbpedia.org&gt; WHERE {
  ?film1 p:starring &lt;http://dbpedia.org/resource/Kevin_Bacon&gt; .
  ?film1 p:starring ?actor1 .
  ?film2 p:starring ?actor1 .
  ?film2 p:starring ?actor2 . };

--  23910 rows

sparql PREFIX p: &lt;http://dbpedia.org/property/&gt;
SELECT ?artist ?artwork ?museum ?director FROM &lt;http://dbpedia.org&gt; 
WHERE {
  ?artwork p:artist ?artist .
  ?artwork p:museum ?museum .
  ?museum p:director ?director };

-- 303 rows

sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt;
PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt;
PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt;
SELECT ?s ?homepage FROM &lt;http://dbpedia.org&gt;  WHERE {
   &lt;http://dbpedia.org/resource/Berlin&gt; geo:lat ?berlinLat .
   &lt;http://dbpedia.org/resource/Berlin&gt; geo:long ?berlinLong . 
   ?s geo:lat ?lat .
   ?s geo:long ?long .
   ?s foaf:homepage ?homepage .
   FILTER (
     ?lat        &lt;=     ?berlinLat + 0.03190235436 &amp;&amp;
     ?long       &gt;=     ?berlinLong - 0.08679199218 &amp;&amp;
     ?lat        &gt;=     ?berlinLat - 0.03190235436 &amp;&amp; 
     ?long       &lt;=     ?berlinLong + 0.08679199218) };

-- 56 rows

sparql PREFIX geo: &lt;http://www.w3.org/2003/01/geo/wgs84_pos#&gt;
PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt;
PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt;
PREFIX p: &lt;http://dbpedia.org/property/&gt;
SELECT ?s ?a ?homepage FROM &lt;http://dbpedia.org&gt;  WHERE {
   &lt;http://dbpedia.org/resource/New_York_City&gt; geo:lat ?nyLat .
   &lt;http://dbpedia.org/resource/New_York_City&gt; geo:long ?nyLong . 
   ?s geo:lat ?lat .
   ?s geo:long ?long .
   ?s p:architect ?a .
   ?a foaf:homepage ?homepage .
   FILTER (
     ?lat        &lt;=     ?nyLat + 0.3190235436 &amp;&amp;
     ?long       &gt;=     ?nyLong - 0.8679199218 &amp;&amp;
     ?lat        &gt;=     ?nyLat - 0.3190235436 &amp;&amp; 
     ?long       &lt;=     ?nyLong + 0.8679199218) };

-- 13 rows
</pre>
 </code>
</blockquote>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-30#1352">
  <rss:title>Clearing Up RDF misrepresentation once again!</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-04-30T15:51:17Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Daniel Lewis has penned a post titled: Clearing up some misconceptions..again, in response to Ben Werdmuller&#39;s post titled: Introducing the Open Data Definition. The great thing about the Linked Data Web is that it&#39;s much easier to discover and respond to these points of view before the ink dries :-) Ben certainly needs to take a look at the Semantic Web FAQ pre or post assimilation of Daniel&#39;s response.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://myopenlink.net/dataspace/person/danieljohnlewis#this" id="link-id12d57690">Daniel Lewis</a> has penned a post titled: <a href="http://vanirsystems.com/danielsblog/2008/04/30/clearing-up-some-misconceptions-again/" id="link-id10c99f18">Clearing up some misconceptions..again</a>, in response to <a href="http://elgg.org/bwerdmuller/foaf#elgg2" id="link-id14fe1bc8">Ben Werdmuller</a>&#39;s post titled: <a href="http://blogs.zdnet.com/social/?p=477" id="link-id141cee58">Introducing the Open Data Definition</a>. </p>
<p>The great thing about the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id105991a8">Linked Data</a> <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id10a6ec78">Web</a> is that it&#39;s much easier to discover and respond to these points of view before the ink dries :-) Ben certainly needs to take a look at the <a href="http://www.w3.org/RDF/FAQ" id="link-id10f78958">Semantic Web FAQ</a> pre or post assimilation of Daniel&#39;s response.</p>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-15#1341">
  <rss:title>Explaining the Granular Social Network</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-04-15T21:03:54Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Courtesy of Thomas Vander Wal&#39;s interesting blog post titled: Explaining the Granular Social Network, I found a nice video that highlights the Who + What you know aspect of Social Networking and the GGG in general. As I can&#39;t quite remix Videos on the spur of the moment (yet), I would encourage you to watch the video and then click on the link to my FOAF Profile, then follow the &quot;Linked Data&quot; tab to see how Linked Data oriented platforms (in my case OpenLink Data Spaces) that exist today actually deliver what&#39;s explained in the video. &quot;What You Know&quot; (Data &amp; Friend Networks) ultimately trumps &quot;Who You Know&quot; (Friend only Networks). The exploitation power of this reality is enhanced exponentially via the Linked Data Web once the implications of beaming SPARQL queries down specific URIs (entry points to Linked Data graphs) become clearer :-)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Courtesy of <a href="http://www.vimeo.com/user321809/l:embed_898144" id="link-id10c725a8">Thomas Vander Wal</a>&#39;s interesting <a href="http://dbpedia.org/resource/Blog" id="link-id142dfb90">blog</a> post titled: Explaining the Granular Social Network, I found a nice video that highlights the Who + What you know aspect of Social Networking and the <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-id1054bc58">GGG</a> in general. </p>

<p>As I can&#39;t quite remix Videos on the spur of the moment (yet), I would encourage you to watch the video and then click on the link to <a href="http://myopenlink.net/dataspace/person/kidehen" id="link-id130b7410">my FOAF Profile</a>, then follow the &quot;<a href="http://dbpedia.org/resource/Linked_Data" id="link-id18485a48">Linked Data</a>&quot; tab to see how <a href="http://dbpedia.org/resource/Linked_Data" id="link-id14070380">Linked Data</a> oriented platforms (in my case <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id10a30f60">OpenLink Data Spaces</a>) that exist today actually deliver what&#39;s explained in the video.
</p>
<p>&quot;What You Know&quot; (<a href="http://www.jasonkolb.com/weblog/2008/03/users-as-data-c.html" id="link-id140f4e28">Data &amp; Friend Networks</a>)  ultimately trumps &quot;Who You Know&quot; (Friend only Networks). The exploitation power of this reality is enhanced exponentially via the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0xdcf0460">Linked Data</a> <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id0xa008f990">Web</a> once the implications of beaming <a href="http://dbpedia.org/resource/SPARQL" id="link-idfdfa2f0">SPARQL</a> queries down specific URIs (entry points to <a href="http://dbpedia.org/resource/Linked_Data" id="link-id15ce0dc0">Linked Data</a> graphs) become clearer :-)</p>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-09#1333">
  <rss:title>Adding Wordpress Blogs into the Linked Data Web using Virtuoso</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-04-09T21:27:34Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Wordpress is a Weblog platform comprised of the following: User Interface - PHP Application Logic - PHP Data Storage (SQL RDBMS) - MySQL via PHP-MySQL Application Server - Apache In the form above (the norm), Wordpress data can be injected into the Linked Data Web via RDFization middleware such as theVirtuoso Sponger (built into all Virtuoso instances) and Triplr. The downside of this approach is that the blog owner doesn&#39;t necessary possess full control over their contributions to the emerging Giant Global Graph or Linked Data. Another route to Linked Data exposure is via Virtuoso&#39;s Metaschema Language for producing RDF Views over ODBC/JDBC accessible Data Sources, that enables the following setup: User Interface - PHP Application Logic - PHP Data Storage (SQL RDBMS) - MySQL via the PHP-MySQL data access interface Virtual Database linkage of MySQL Tables into Virtuoso RDF View generated over the Virtual SQL Tables Application Server - Virtuoso which provides Linked Data Deployment such that RDF Linked Data is exposed when requested by Web User Agents. Alternatively, you can also exploit Virtuoso as the SQL DBMS, RDF DBMS, Application Server, and Linked Data Deployment platform: User Interface - PHP Application Logic - PHP Data Storage (SQL RDBMS) - Virtuoso via PHP-ODBC data access interface (* ODBC is Virtuoso&#39;s native SQL CLI/API *) RDF View generated over the Native SQL Tables Application Server - Virtuoso which provides Linked Data Deployment such that RDF Linked Data is exposed when requested by Web User Agents (e.g. OpenLink RDF Browser, Zitgist Data Viewer, DISCO Hyperdata Browser, and Tabulator). Benefits? 
Each user account gets a proper Linked Data URI (ID) that can me meshed/smushed with other IDs (so you add data from this new blog space to other linked data sources associated with you other URIs/IDs) Each post gets a proper URI All data is now query-able via SPARQL Discoverability increases exponentially (without drop in relevance in either direction i.e. discovering or being discovered) How Do I map the WordPress SQL Schema to RDF using Virtuoso? Determine the RDF Schema or Ontologies that define the Classes for which you will be producing instance data (e.g. SIOC and FOAF) Declare URI/IRI generator functions (*special Virtuoso functions*) Use SPARQL Graph patterns to apply URI/IRI generator functions to Tables, Views, Table Values mode Stored Procedures, Query Resultsets as part of RDBMS to RDF mapping Read the Meta Schema Language guide or simply apply our &quot;WordPress SQL Schema to RDF&quot; script to your Virtuoso hosted instance. Of course, there are other mappings that cover other PHP applications deployed via Virtuoso: phpBB3 SQL Schema to RDF Drupal SQL Schema to RDF MediaWiki SQL Schema to RDF Live Demos? Virtuoso Hosting phpBB3 (example User URI) Virtuoso Hosting Drupal (example User URI) Virtuoso Hosting MediaWiki (example User URI)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://dbpedia.org/resource/WordPress" id="link-id101103b0">Wordpress</a> is a Weblog platform comprised of the following: </p>
<ol>
  <li>User Interface - <a href="http://dbpedia.org/resource/PHP" id="link-id107ba368">PHP</a>
</li>
  <li>Application Logic - <a href="http://dbpedia.org/resource/PHP" id="link-id107066b8">PHP</a> </li>
  <li>
  <a href="http://dbpedia.org/resource/Data" id="link-id13968340">Data</a> Storage (<a href="http://dbpedia.org/resource/SQL" id="link-id104c5350">SQL</a> <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id1076d790">RDBMS</a>) - <a href="http://dbpedia.org/resource/MySQL" id="link-id109c4ea0">MySQL</a> via <a href="http://dbpedia.org/resource/PHP" id="link-id133af570">PHP</a>-<a href="http://dbpedia.org/resource/MySQL" id="link-idf0b03b0">MySQL</a> </li>
  <li>
  <a href="http://dbpedia.org/resource/Application_server" id="link-id13217630">Application Server</a> - <a href="http://dbpedia.org/resource/Apache" id="link-id108219d8">Apache</a>
</li>
</ol>
<p>In the form above (the norm), <a href="http://dbpedia.org/resource/WordPress" id="link-id105c6d88">Wordpress</a> <a href="http://dbpedia.org/resource/Data" id="link-id104938f8">data</a> can be injected into the <a href="http://dbpedia.org/resource/Linked_Data" id="link-id107a5f18">Linked Data</a> <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id177329c0">Web</a> via RDFization middleware such as the <a href="http://virtuoso.openlinksw.com/Whitepapers/html/VirtSpongerWhitePaper.html" id="link-id10531b50">Virtuoso Sponger</a> (built into all <a href="http://virtuoso.openlinksw.com" id="link-id10d7e710">Virtuoso</a> instances) and <a href="http://triplr.org/" id="link-id107dcab8">Triplr</a>. The downside of this approach is that the <a href="http://dbpedia.org/resource/Blog" id="link-id1055ab68">blog</a> owner doesn&#39;t necessarily possess full control over their contributions to the emerging <a href="http://dbpedia.org/resource/Giant_Global_Graph" id="link-idfed0358">Giant Global Graph</a> or <a href="http://dbpedia.org/resource/Linked_Data" id="link-id10d70668">Linked Data</a>.</p>
<p>Another route to <a href="http://dbpedia.org/resource/Linked_Data" id="link-id104c7f68">Linked Data</a> exposure is via <a href="http://virtuoso.openlinksw.com" id="link-id0xa255fb50">Virtuoso</a>&#39;s Metaschema Language for producing <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id10968388">RDF</a> Views over <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id13f594c8">ODBC</a>/<a href="http://dbpedia.org/resource/Java_Database_Connectivity" id="link-id138f69a8">JDBC</a> accessible <a href="http://dbpedia.org/resource/Data" id="link-id1393c068">Data</a> Sources, that enables the following setup:</p>
<ol>
  <li>User Interface - <a href="http://dbpedia.org/resource/PHP" id="link-id0x9fb9c478">PHP</a> </li>
  <li>Application Logic - PHP  </li>
  <li>
  <a href="http://dbpedia.org/resource/Data" id="link-id0xc605960">Data</a> Storage (<a href="http://dbpedia.org/resource/SQL" id="link-id0xc2be608">SQL</a> <a href="http://dbpedia.org/resource/Relational_database_management_system" id="link-id0xc7a28a8">RDBMS</a>) - <a href="http://dbpedia.org/resource/MySQL" id="link-id0xc7228f0">MySQL</a> via the PHP-MySQL <a href="http://dbpedia.org/resource/Data">data</a> access interface  </li>
  <li>
  <a href="http://dbpedia.org/resource/Virtual_Database" id="link-id134b1ee8">Virtual Database</a> linkage of MySQL Tables into Virtuoso  </li>
  <li>
  <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-idfe31548">RDF</a> View generated over the Virtual SQL Tables  </li>
  <li>
  <a href="http://dbpedia.org/resource/Application_server" id="link-id0xb8dfa68">Application Server</a> - Virtuoso which provides <a href="http://dbpedia.org/resource/Linked_Data" id="link-id0xc149518">Linked Data</a> Deployment such that <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id10ad9ca0">RDF</a> <a href="http://dbpedia.org/resource/Linked_Data">Linked Data</a> is exposed when requested by <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-idfd352e0">Web</a> User Agents.</li>
</ol>
<p>Alternatively, you can also exploit Virtuoso as the SQL DBMS, <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x9ec4f440">RDF</a> DBMS, Application Server, and Linked Data Deployment platform:</p>
<ol>
  <li>User Interface - PHP
  </li>
  <li> Application Logic - PHP                                </li>
  <li>Data Storage (SQL RDBMS) - Virtuoso via PHP-<a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id14197218">ODBC</a> data access interface (* <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id103d1a80">ODBC</a> is Virtuoso&#39;s native SQL CLI/API *)                                  </li>
  <li><a href="http://dbpedia.org/resource/Resource_Description_Framework">RDF</a> View generated over the Native SQL Tables                                    </li>
  <li>Application Server - Virtuoso which provides Linked Data Deployment such that RDF Linked Data is exposed when requested by <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id13918d68">Web</a> User Agents (e.g. <a href="http://demo.openlinksw.com/rdfbrowser" id="link-idff835f0">OpenLink RDF Browser</a>, <a href="http://zitgist.com/about/" id="link-id1372e510">Zitgist</a> <a href="http://dataviewer.zitgist.com" id="link-id109c3048">Data Viewer</a>, <a href="http://www4.wiwiss.fu-berlin.de/rdf_browser" id="link-id105d97f0">DISCO Hyperdata Browser</a>, and <a href="http://dig.csail.mit.edu/2005/ajar/release/tabulator/0.8/tab.html" id="link-id10cc20d8">Tabulator</a>). </li>
</ol>

  <h2 align="left">Benefits?</h2>
  <ul>
    <li>Each user account gets a proper Linked Data <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id108c92b0">URI</a> (ID) that can be meshed/smushed with other IDs (so you add data from this new <a href="http://dbpedia.org/resource/Blog" id="link-idfd39648">blog</a> space to other linked data sources associated with your other URIs/IDs)
    </li>
    <li>Each post gets a proper <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id10add540">URI</a></li>
    <li>All data is now query-able via <a href="http://dbpedia.org/resource/SPARQL" id="link-id101b98f0">SPARQL</a></li>
    <li>Discoverability increases exponentially (without drop in relevance in either direction i.e. discovering or being discovered)</li>
  </ul>
  <p>How Do I map the <a href="http://dbpedia.org/resource/WordPress" id="link-id12e448c0">WordPress</a> SQL Schema to RDF using Virtuoso?    </p>
  <ul>
    <li>Determine the RDF Schema or Ontologies that define the Classes for which you will be producing instance data (e.g. SIOC and FOAF)    </li>
    <li>Declare <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-idfaf5c80">URI</a>/IRI generator functions (*special Virtuoso functions*)    </li>
    <li>Use <a href="http://dbpedia.org/resource/SPARQL" id="link-id100436b8">SPARQL</a> Graph patterns to apply <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id0x9de74950">URI</a>/IRI generator functions to Tables, Views, Table Values mode Stored Procedures, Query Resultsets as part of RDBMS to RDF mapping </li>
  </ul>
  <p> Read the <a href="http://virtuoso.openlinksw.com/dataspace/dav/wiki/Main/VOSSQL2RDF" id="link-idfaf5d58">Meta Schema Language guide</a> or simply apply our &quot;<a href="http://dbpedia.org/resource/WordPress" id="link-id0x9ef73c78">WordPress</a> SQL Schema to RDF&quot; script to your Virtuoso hosted instance.
                                                        
Of course, there are other mappings that cover other PHP applications deployed via Virtuoso:</p>
  <ul>
    <li> <a href="http://dbpedia.org/resource/PhpBB" id="link-id179f4870">phpBB3</a> SQL Schema to RDF </li>
    <li>
  <a href="http://dbpedia.org/resource/Drupal" id="link-id10b263d8">Drupal</a> SQL Schema to RDF </li>
    <li>
  <a href="http://dbpedia.org/resource/MediaWiki" id="link-id10263a40">MediaWiki</a> SQL Schema to RDF </li>
  </ul>
<h2>Live Demos?</h2>
  <ul>
    <li>
  <a href="http://demo.openlinksw.com/phpBB3" id="link-id17761e88">Virtuoso Hosting phpBB3</a> (<a href="http://demo.openlinksw.com/phpBB3/user/demo#this" id="link-id10087e68">example User URI</a>)</li>
    <li>
  <a href="http://demo.openlinksw.com/drupal" id="link-id1091f1d8">Virtuoso Hosting Drupal</a> (<a href="http://demo.openlinksw.com/drupal/user/demo#this" id="link-id13e3d468">example User URI</a>)</li>
    <li>
  <a href="http://demo.openlinksw.com/mediawiki" id="link-id10531be0">Virtuoso Hosting MediaWiki</a> (<a href="http://demo.openlinksw.com/mediawiki/user/KingsleyIdehen#this" id="link-id109c5d40">example User URI</a>)</li>
  </ul>
  ]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-04-09#1332">
  <rss:title>Recent Data Portability, Linked Data, and Open Data Access Podcasts</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-04-09T17:15:56Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I just listened to, and very much enjoyed (lots of chuckling) Dave Beckett&#39;s podcast interview on the Talis podcast network. Clearly Dave has a bent for funny project names etc.. He also introduced &quot;Inter-Webs&quot; (Web Data Spaces in my parlance) towards the end of the interview. Trent Adams, Steve Greenberg, and I, also had a podcast chat about Web Data Portability and Accessibility (Linked Data). I also remixed John Breslin&#39;s &quot;Data Portability &amp; Me&quot; presentation to produce: &quot;Data Accessibility &amp; Me&quot;. The podcast interviews and presentations provide contributions to the broadening discourse about Open Data Access / Connectivity on the Web.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>I just listened to, and very much enjoyed (lots of chuckling) <a href="http://www.dajobe.org/" id="link-id177310c8">Dave Beckett</a>&#39;s podcast interview on the <a href="http://talk.talis.com/" id="link-id1056ec98">Talis podcast network</a>. Clearly Dave has a bent for funny project names etc.. He also introduced &quot;Inter-Webs&quot; (<a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> <a href="http://dbpedia.org/resource/Data">Data</a> Spaces in my parlance) towards the end of the interview.</p>

<p>
<a href="http://www.mediaslate.org/wp/about/" id="link-idfc558f0">Trent Adams</a>, <a href="http://www.linkedin.com/pub/0/49b/4b5" id="link-id107137b0">Steve Greenberg</a>, and I, also had a podcast chat about <a href="http://www.mediaslate.org/wp/2008/03/29/dataportability-in-motion-podcast/" id="link-id10663ec8">Web Data Portability and Accessibility (Linked Data)</a>. I also remixed <a href="http://www.johnbreslin.com/" id="link-id104617f0">John Breslin</a>&#39;s &quot;<a href="http://www.slideshare.net/Cloud/dataportability-and-me-introducing-sioc-foaf-and-the-semantic-web/" id="link-id12ca2c70">Data Portability &amp; Me</a>&quot; presentation to produce: &quot;<a href="http://www.slideshare.net/Cloud/data-accessibility-and-me-introducing-sioc-foaf-and-the-linked-data-web/" id="link-idfdf0cd8">Data Accessibility &amp; Me</a>&quot;.
</p>
<p>The podcast interviews and presentations provide contributions to the broadening discourse about Open Data Access / Connectivity on the Web.</p>

]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-03-12#1323">
  <rss:title>So, What Does &quot;HREF&quot; Stand For, Anyway</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-03-12T16:08:46Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">As per usual I am writing this post with the aim of killing a number of meme-birds with a single post in relation to the emerging Linked Data Web. *On* the ubiquitous Web of &quot;Linked Documents&quot;, HREF means (by definition and usage): Hypertext Reference to an HTTP accessible Data Object of Type: &quot;Document&quot; (an information resource). Of course we don&#39;t make the formal connection of Object Type when dealing with the Web on a daily basis, but whenever you encounter the &quot;resource not found&quot; condition notice the message: HTTP/1.0 404 Object Not Found, from the HTTP Server tasked with retrieving and returning the resource. *In* the Web of &quot;Linked Data&quot;, a complimentary addition to the current Web of &quot;Linked Documents&quot;, HREF is used to reference Data Objects that are of a variety of &quot;Types&quot;, not just &quot;Documents&quot;. And the way this is achieved, is by using Data Object Identifiers (URIs / IRIs that are generated by the Linked Data deployment platform) in the strict sense i.e. Data Identity (URI) is separated from Data Address (URL). Thus, you can reference a Person Data Object (aka an instance of a Person Class) in your HREF and the HTTP Server returns a Description of the Data Object via a Document (again, an information resource). A document containing the Description of a Data Object typically contains HREFs to other Data Objects that expose the Attributes and Relationships of the initial Person Data Object, and it this collection of Data Objects that is technically called a &quot;Graph&quot; -- which is what RDF models. What I describe above is basic stuff for anyone that&#39;s familiar with Object Database or Distributed Objects technology and concepts. URI and URL confusion The Linked Document Web is a collection of physical resources that traverse the Web Information Bus in palatable format i.e documents. 
Thus, Document Object Identity and Document Object Data Address can be the same thing i.e. a URL can serve as the ID/URI of a Document Data Object. The Linked Data Web on the other hand, is a Distributed Object Database, and each Data Object must be uniquely defined, otherwise we introduce ambiguity that ultimately taints the Database itself (making incomprehensible to reasoning challenged machines). Thus we must have unique Object IDs (URIs / IRIs) for People, Places, Events, and other things that aren&#39;t Documents. Once we follow the time tested rules of Identity, People can then be associated with the things they create (blog posts, web pages, bookmarks, wikiwords etc). RDF is about expressing these graph model relationships while RDF serialization formats enables the information resources to transport these data object link ladden information resources to requesting User Agents. Put in more succinct terms, all documents on the Web are compound documents in reality (e.g. mast contain a least an image these days). The Linked Data Web is about a Web where Data Object IDs (URIs) enable us to distill source data from the information contained in a compound document. Examples: &lt;http://community.linkeddata.org/dataspace/person/kidehen2#this&gt; - the ID (URI minted from URL via addition of #this) of a Data Object of Type Person that Identifies me. 
The Person definition I use comes from the FOAF vocabulary/schema/ontology/data dictionary &lt;http://community.linkeddata.org/dataspace/person/kidehen2&gt; - the URI (also a URL) of a FOAF file that contains a description of the Data Object ID: &lt;http://community.linkeddata.org/dataspace/person/kidehen2#this&gt; (me) As an information resource &lt;http://community.linkeddata.org/dataspace/person/kidehen2&gt; can be dispatched from an HTTP server to a User Agent in (X)HTML, RDF/XML, N3/Turtle representations via HTTP Content Negotiation (note: Look at the &quot;Linked Data&quot; tab to see one example of what Data Links facilitate re. Data Discovery and Exploration) If I choose an Object ID of &lt;http://community.linkeddata.org/dataspace/person/kidehen2/this&gt; instead of &lt;http://community.linkeddata.org/dataspace/person/kidehen2#this&gt; then the HTTP Server should not return an information resource (i.e provide 200 OK response) when a User Agent requests a resource via HTTP using the URI: &lt;http://community.linkeddata.org/dataspace/person/kidehen2/this&gt;, because a Data Object ID (URI) and the Data Object Address (URL) cannot be the same when my Data Object isn&#39;t of Type Document; the sever has to use response code 303 to redirect the user agent to the URL of an information resource that matches the Content-type designated in the HTTP Request or determine representation based on it&#39;s own quality of service rules for the information resource associated with the Object ID (URI). The degree of unobtrusiveness of new technology, concepts, or new applications of existing technology, is what ultimately determines eventual uptake and meme virulence (network effects). For a while, the Semantic Web meme was mired in confusion and general misunderstanding due to a shortage of practical use case scenario demos. 
The emergence of the SPARQL Query Language has provided critical infrastructure for a number of products, projects, and demos, that now make the utility of the Semantic Web vision mush clearly via the simplicity of Linked Data, as exemplified by the following: Linking Open Data Community - collection of People and Linked Data Spaces (across a variety of domains) DBpedia - Ground zero for experiencing and comprehending Linked Data OpenLink Data Spaces - a simple solution for creating Linked Data Web presence via from existing Web Data Sources (Blogs, Wikis, Shared Bookmarks, Tag Spaces, Web Sites, Social Networking Services, Web Services, Discussion Forums etc..) OpenLink Virtuoso - a Universal Server for generating, managing, and deploying RDF Linked Data from SQL, XML, Web Services based data sources Why Is This Post a Linked Data Demo, Again? Place the permalink of this post in a Linked Data aware user agent (OpenLink RDF Browser1, OpenLink RDF Browser2, Zitgist, DISCO, Tabulator), and the you can see the universal of interlinked data exposed by this post. The Title of this post should not be the sole mechanism for determining that it is Linked to other posts about the same topic. Related Ryan Tomayko&#39;s post titled: So, What Does &quot;HREF&quot; Stand For, Anyway Elias Torre&#39;s post titled: The Web FTW Cool URIs for the Semantic Web.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>As per usual I am writing this post with the aim of killing a number of <a href="http://dbpedia.org/resource/Meme" id="link-id0x1caa10d8">meme</a>-birds with a single post in relation to the emerging <a href="http://dbpedia.org/resource/Linked_Data" id="link-id156867c8">Linked Data Web</a>.</p>

<p>*On* the ubiquitous <a href="http://dbpedia.org/resource/World_Wide_Web" id="link-id0x1e5a1a08">Web</a> of &quot;Linked Documents&quot;, HREF means (by definition and usage): <a href="http://dbpedia.org/resource/Hypertext" id="link-id16078f10">Hypertext</a> Reference to an <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0x9e840368">HTTP</a> accessible <a href="http://dbpedia.org/resource/Data" id="link-id0x9e570ce8">Data</a> Object of Type: &quot;Document&quot; (an <a href="http://dbpedia.org/resource/Information" id="link-id0xccc6ee8">information</a> resource). Of course we don&#39;t make the formal connection of Object Type when dealing with the <a href="http://dbpedia.org/resource/World_Wide_Web">Web</a> on a daily basis, but whenever you encounter the  &quot;resource not found&quot; condition  notice the message: <a href="http://dbpedia.org/resource/HTTP_404" id="link-id153b4d98">HTTP/1.0 404</a> Object Not Found, from the <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol">HTTP</a> Server tasked with retrieving and returning the resource. </p>

<p>*In* the Web of &quot;<a href="http://dbpedia.org/resource/Linked_Data" id="link-id0x9ed9fb78">Linked Data</a>&quot;, a complementary addition to the current Web of &quot;Linked Documents&quot;, HREF is used to reference <a href="http://dbpedia.org/resource/Data">Data</a> Objects that are of a variety of &quot;Types&quot;, not just &quot;Documents&quot;. And the way this is achieved, is by using <a href="http://dbpedia.org/resource/Surrogate_key" id="link-id153d4438">Data Object Identifiers</a> (URIs / IRIs that are generated by the <a href="http://dbpedia.org/resource/Linked_Data">Linked Data</a> deployment platform) in the strict sense i.e. Data Identity (<a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id0xc9ef280">URI</a>) is separated from Data Address (<a href="http://dbpedia.org/resource/Uniform_Resource_Locator" id="link-id0x1cb62390">URL</a>). Thus, you can reference a Person Data Object (aka an instance of a Person Class) in your HREF and the <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id1554e458">HTTP</a> Server returns a Description of the Data Object via a Document (again, an <a href="http://dbpedia.org/resource/Information">information</a> resource). A document containing the Description of a Data Object typically contains HREFs to other Data Objects that expose the Attributes and Relationships of the initial Person Data Object, and it is this collection of Data Objects that is technically called a &quot;Graph&quot; -- which is what <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0xc67a780">RDF</a> models.</p>
<blockquote>What I describe above is basic stuff for anyone that&#39;s familiar with Object Database or Distributed Objects technology and concepts.</blockquote>

<h2><a href="http://dbpedia.org/resource/Uniform_Resource_Identifier">URI</a> and <a href="http://dbpedia.org/resource/Uniform_Resource_Locator">URL</a> confusion</h2>
<p>The Linked Document Web is a collection of physical resources that traverse the Web Information Bus in palatable format i.e documents. Thus, Document Object Identity and Document Object Data Address can be the same thing i.e. a <a href="http://dbpedia.org/resource/Uniform_Resource_Locator" id="link-id1525d028">URL</a> can serve as the <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id16e50b50">ID/URI</a> of a Document Data Object.</p>

<p>The Linked Data Web on the other hand, is a Distributed Object Database, and each Data Object must be uniquely defined, otherwise we introduce ambiguity that ultimately taints the Database itself (making it incomprehensible to reasoning challenged machines). Thus we must have unique Object IDs (URIs / IRIs) for People, Places, Events, and other things that aren&#39;t Documents. Once we follow the time tested rules of Identity, People can then be associated with the things they create (<a href="http://dbpedia.org/resource/Blog" id="link-id0xc7c3ce0">blog</a> posts, web pages, bookmarks, wikiwords etc). <a href="http://dbpedia.org/resource/Resource_Description_Framework">RDF</a> is about expressing these graph model relationships while RDF serialization formats enable the transport of these data object link laden information resources to requesting User Agents.</p>

<p>Put in more succinct terms, all documents on the Web are compound documents in reality (e.g. most contain at least an image these days). The Linked Data Web is about a Web where Data Object IDs (URIs) enable us to distill source data from the information contained in a compound document.</p>

<h2>Examples:</h2>

<ol>
<li>&lt;http://community.linkeddata.org/dataspace/person/kidehen2#this&gt; - the ID (URI minted from URL via addition of #this) of a Data Object of Type Person that Identifies me. The Person definition I use comes from the FOAF vocabulary/schema/ontology/data dictionary</li>

<li>&lt;http://community.linkeddata.org/dataspace/person/kidehen2&gt; - the URI (also a URL) of a FOAF file that contains a description of the Data <a href="http://dbpedia.org/resource/Identity_%28object-oriented_programming%29" id="link-id0xca491e0">Object ID</a>: &lt;http://community.linkeddata.org/dataspace/person/kidehen2#this&gt; (me)</li>

<li>As an information resource &lt;http://community.linkeddata.org/dataspace/person/kidehen2&gt; can be dispatched from an HTTP server to a User Agent in (X)HTML, RDF/XML, N3/Turtle representations via HTTP Content Negotiation (<strong>note:</strong> Look at the &quot;Linked Data&quot; tab to see one example of what Data Links facilitate re. Data Discovery and Exploration)</li>

<li>If I choose an <a href="http://dbpedia.org/resource/Identity_%28object-oriented_programming%29">Object ID</a> of &lt;http://community.linkeddata.org/dataspace/person/kidehen2/this&gt; instead of &lt;http://community.linkeddata.org/dataspace/person/kidehen2#this&gt; then the HTTP Server should not return an information resource (i.e. provide a 200 OK response) when a User Agent requests a resource via HTTP using the URI: &lt;http://community.linkeddata.org/dataspace/person/kidehen2/this&gt;, because a Data Object ID (URI) and the Data Object Address (URL) cannot be the same when my Data Object isn&#39;t of Type Document; the server has to use response code 303 to redirect the user agent to the URL of an information resource that matches the Content-type designated in the HTTP Request or determine representation based on its own quality of service rules for the information resource associated with the Object ID (URI).</li>
</ol>
 
<p>The degree of unobtrusiveness of new technology, concepts, or new applications of existing technology, is what ultimately determines eventual uptake and <a href="http://dbpedia.org/resource/Meme">meme</a> virulence (network effects). For a while, the <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id0xc86cda0">Semantic Web</a> meme was mired in confusion and general misunderstanding due to a shortage of practical use case scenario demos.  </p>

<p>The emergence of the <a href="http://dbpedia.org/resource/SPARQL" id="link-id0xc614158">SPARQL</a> Query Language has provided critical infrastructure for a number of products, projects, and demos, that now make the utility of the <a href="http://dbpedia.org/resource/Semantic_Web">Semantic Web</a> vision much clearer via the simplicity of Linked Data, as exemplified by the following:</p>

<ol>
<li>
  <a href="http://community.linkeddata.org/dataspace/organization/lod#this" id="link-id0xc7c19f0">Linking Open Data Community</a> - collection of People and Linked Data Spaces (across a variety of domains)</li>
<li>
  <a href="http://dbpedia.org/resource/DBpedia" id="link-id0xcb1c398">DBpedia</a>  - Ground zero for experiencing and comprehending Linked Data</li>
<li>
  <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id0xc16e458">OpenLink Data Spaces</a> - a simple solution for creating a Linked Data Web presence from existing Web Data Sources (Blogs, Wikis, Shared Bookmarks, <a href="http://dbpedia.org/resource/Tag" id="link-id0xc340200">Tag</a> Spaces,  Web Sites, Social Networking Services, Web Services, Discussion Forums etc.)</li>
<li>OpenLink <a href="http://virtuoso.openlinksw.com" id="link-id0xca83470">Virtuoso</a> - a Universal Server for generating, managing, and deploying RDF Linked Data from <a href="http://dbpedia.org/resource/SQL" id="link-id0xcce3870">SQL</a>, XML, Web Services based data sources</li>
</ol>

<h2>Why Is This Post a Linked Data Demo, Again?</h2>
<p>Place the permalink of this post in a Linked Data aware user agent (<a href="http://demo.openlinksw.com/rdfbrowser" id="link-id17b79488">OpenLink RDF Browser1</a>, <a href="http://demo.openlinksw.com/rdfbrowser2" id="link-id15957150">OpenLink RDF Browser2</a>, <a href="http://dataviewer.zitgist.com/" id="link-id15550cf8">Zitgist</a>, <a href="http://www4.wiwiss.fu-berlin.de/rdf_browser" id="link-id1565a680">DISCO</a>, <a href="http://dig.csail.mit.edu/2005/ajar/release/tabulator/0.8/tab.html" id="link-id15700350">Tabulator</a>), and then you can see the universe of interlinked data exposed by this post. The Title of this post should not be the sole mechanism for determining that it is Linked to other posts about the same topic.</p>

<h2>Related</h2>
<ul>
<li><a href="http://tomayko.com" id="link-id15c56720">Ryan Tomayko</a>&#39;s post titled: <a href="http://tomayko.com/writings/wtf-is-an-href-anyway" id="link-id1514a328">So, What Does &quot;HREF&quot; Stand For, Anyway</a></li>
<li><a href="http://torrez.us/who#elias" id="link-id14eec928">Elias Torres</a>&#39; post titled: <a href="http://torrez.us/archives/2008/03/10/563/" id="link-id15722c08">The Web FTW</a></li>
<li><a href="http://www.w3.org/TR/cooluris/" id="link-id1576c118">Cool URIs for the Semantic Web.</a></li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-02-08#1314">
  <rss:title>10 Reasons to use OpenLink Data Spaces (ODS)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-02-08T17:33:45Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Via post by Daniel Lewis, titled:10 Reasons to use OpenLink Data Spaces There are quite a few reasons to use OpenLink Data Spaces (ODS). Here are 10 of the reasons why I use ODS: Its native support of DataPortability Recommendations such as RSS, Atom, APML, Yadis, OPML, Microformats, FOAF, SIOC, OpenID and OAuth. Its native support of Semantic Web Technologies such as: RDF and SPARQL/SPARUL for querying. Everything in ODS is an Object with its own URI, this is due to the underlying Object-Relational Architecture provided by Virtuoso. It has all the social media components that you could need, including: blogs, wikis, social networks, feed readers, CRM and a calendar. It is expandable by installing pre-configured components (called VADs), or by re-configuring a LAMP application to use Virtuoso. Some examples of current VADs include: MediaWiki, Wordpress and Drupal. It works with external webservices such as: Facebook, del.icio.us and Flickr. Everything within OpenLink Data Spaces is Linked Data, which provides more meaningful information than just plain structural information. This meaningful information could be used for complex inferencing systems, as ODS can be seen as a Knowledge Base. ODS builds bridges between the existing static-document based web (aka &quot;Web 1.0&quot;), the more dynamic, services-oriented, social and/or user-orientated webs (aka &quot;Web 2.0&quot;) and the web which we are just going into, which is more data-orientated (aka &quot;Web 3.0&quot; or &quot;Linked Data Web&quot;). It is fully supportive of Cloud Computing, and can be installed on Amazon EC2. Its released free under the GNU General Public License (GPL). 
[note]However, it is technically dual licensed as it lays on top of the Virtuoso Universal Server which has both Commercial and GPL licensing[/note] The features above collectively provide users with a Linked Data Junction Box that may reside within corporate intranets or &quot;out in the clouds&quot; (Internet). You can consume, share, and publish data in a myriad of formats using a plethora of protocols, without any programming. ODS is simply about exposing the data from your Web 1.0, 2.0, 3.0 application interactions in structured form, with Linking, Sharing, and ultimately Meshing (not Mashing) in mind. Note: Although ODS is equipped with a broad array of Web 2.0 style Applications, you do not need to use native ODS apps in order to exploit its power. It binds to anything that supports the relevant protocols and data formats.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Via post by <a href="http://vanirsystems.com/danielsblog" id="link-id1480d7c0">Daniel Lewis</a>, titled:<a href="http://vanirsystems.com/danielsblog/2008/02/08/10-reasons-to-use-openlink-data-spaces/#comments" id="link-id1320a618">10 Reasons to use OpenLink Data Spaces</a>
</p>
<blockquote>
<p>There are quite a few reasons to use <a href="http://en.wikipedia.org/wiki/OpenLink_Data_Space" id="link-id103eb060">OpenLink Data Spaces (ODS)</a>. Here are 10 of the reasons why I use ODS:</p>
<ol>
<li>Its native support of DataPortability Recommendations such as <a href="http://dbpedia.org/resource/RSS" id="link-id18957e88">RSS</a>, <a href="http://dbpedia.org/resource/Atom_%28standard%29" id="link-id1410a9c0">Atom</a>, <a href="http://www.apml.org/" id="link-idfde4b90">APML</a>, <a href="http://dbpedia.org/resource/Yadis" id="link-id1328c260">Yadis</a>, <a href="http://dbpedia.org/resource/OPML" id="link-id10133f70">OPML</a>, <a href="http://dbpedia.org/resource/Microformat" id="link-id16e19be0">Microformats</a>, <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id12deef98">FOAF</a>, <a href="http://dbpedia.org/resource/SIOC" id="link-id15fb99b0">SIOC</a>, <a href="http://dbpedia.org/resource/OpenID" id="link-id1390ae10">OpenID</a> and <a href="http://en.wikipedia.org/wiki/OAuth" id="link-id14dcce70">OAuth</a>.</li>
<li>Its native support of Semantic Web Technologies such as: <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id15fc75a0">RDF</a> and <a href="http://dbpedia.org/resource/SPARQL" id="link-id14255238">SPARQL</a>/<a href="http://jena.hpl.hp.com/~afs/SPARQL-Update.html" id="link-id15fe2e40">SPARUL</a> for querying.</li>
<li>Everything in ODS is an <a href="http://dbpedia.org/resource/Object_%28computer_science%29" id="link-id11c204a0">Object</a> with its own <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-id14812560">URI</a>, this is due to the underlying <a href="http://dbpedia.org/resource/Object-relational_database" id="link-idf663e08">Object-Relational</a> Architecture provided by <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server" id="link-id1484e4c8">Virtuoso</a>.</li>
<li>It has all the social media components that you could need, including: <a href="http://dbpedia.org/resource/Blog" id="link-id10120b58">blogs</a>, <a href="http://dbpedia.org/resource/Wiki" id="link-id14d9a608">wikis</a>, <a href="http://dbpedia.org/resource/Social_network_service" id="link-idf0b3a30">social networks</a>, <a href="http://dbpedia.org/resource/Aggregator" id="link-id188d7c78">feed readers</a>, <a href="http://dbpedia.org/resource/Customer_relationship_management" id="link-id134a2c48">CRM</a> and a <a href="http://dbpedia.org/resource/Calendar" id="link-idf66af80">calendar</a>.</li>
<li>It is expandable by installing pre-configured components (called VADs), or by re-configuring a <a href="http://dbpedia.org/resource/LAMP_%28software_bundle%29" id="link-id102e8008">LAMP</a> application to use <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server" id="link-id13fe2b68">Virtuoso</a>. Some examples of current VADs include: <a href="http://dbpedia.org/resource/MediaWiki" id="link-id1011d9f0">MediaWiki</a>, <a href="http://dbpedia.org/resource/WordPress" id="link-id13624060">Wordpress</a> and <a href="http://dbpedia.org/resource/Drupal" id="link-id100c4510">Drupal</a>.</li>
<li>It works with external webservices such as: <a href="http://dbpedia.org/resource/Facebook" id="link-id131fe6d0">Facebook</a>, <a href="http://dbpedia.org/resource/Del.icio.us" id="link-idfdd1580">del.icio.us</a> and <a href="http://dbpedia.org/resource/Flickr" id="link-id1496aff0">Flickr.</a>
  </li>
<li>Everything within OpenLink Data Spaces is <a href="http://dbpedia.org/resource/Linked_Data" id="link-id17114c00">Linked Data</a>, which provides more meaningful information than just plain structural information. This meaningful information could be used for complex inferencing systems, as ODS can be seen as a <a href="http://dbpedia.org/resource/Expert_system" id="link-id15ea4108">Knowledge Base</a>.</li>
<li>ODS builds bridges between the existing static-document based web (aka &quot;<a href="http://dbpedia.org/resource/Web_1.0" id="link-idf08b338">Web 1.0</a>&quot;), the more dynamic, services-oriented, social and/or user-orientated webs (aka &quot;<a href="http://dbpedia.org/resource/Web_2.0" id="link-idfde26e0">Web 2.0</a>&quot;) and the web which we are just going into, which is more data-orientated (aka &quot;<a href="http://dbpedia.org/resource/Web_3.0" id="link-idf9b7328">Web 3.0</a>&quot; or &quot;Linked Data Web&quot;).</li>
<li>It is fully supportive of <a href="http://dbpedia.org/resource/Cloud_computing" id="link-id189480d0">Cloud Computing</a>, and can be installed on <a href="http://dbpedia.org/resource/Amazon_Elastic_Compute_Cloud" id="link-id10026778">Amazon EC2</a>.</li>
<li>Its released free under the GNU <a href="http://dbpedia.org/resource/GNU_General_Public_License" id="link-id16002fb0">General Public License (GPL)</a>. [note]However, it is technically dual licensed as it lays on top of the <a href="http://en.wikipedia.org/wiki/Virtuoso_Universal_Server" id="link-id132d4238">Virtuoso Universal Server</a> which has both Commercial and GPL licensing[/note]</li>
</ol>
</blockquote>
<p>The features above collectively provide users with a Linked Data Junction Box that may reside within corporate intranets or &quot;out in the clouds&quot; (Internet). You can consume, share, and publish data in a myriad of formats using a plethora of protocols, without any programming. ODS is simply about exposing the data from your Web 1.0, 2.0, 3.0 application interactions in structured form, with Linking, Sharing, and ultimately Meshing (not Mashing) in mind.</p>

<p>
<strong>Note:</strong> Although ODS is equipped with a broad array of Web 2.0 style Applications, you do not need to use native ODS apps in order to exploit its power. It binds to anything that supports the relevant protocols and data formats.</p>

]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-31#1306">
  <rss:title>FOAF-ing Linked Data is quite SIOC-ing</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-01-31T02:40:12Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">The title of this post is a &quot;Tongue in cheek&quot; expression of euphoria now that I have FOAF and SIOC (pronounced SHOCK) based data spaces exposed via my FOAF and my SIOC information resource (RDF files) URIs. If you want to explore who I know, what I read, and what I&#39;ve tagged (amongst other things), all you have to do is: Beam a SPARQL query down my data space URIs which expose FOAF or SIOC based interconnected Linked Data graphs. Walkthrough using an RDF Browser until you reach a beachhead and then beam your SPARQL from there (remember you only need the URI of the RDF Data Source, and while in my Data Space every data item has a proper URI). Some Tools that help you comprehend what I am saying: Browsers Zitgist Data Viewer (SIOC and FOAF data spaces) OpenLink RDF Browser (SIOC and FOAF data spaces) DISCO (SIOC and FOAF data spaces) Tabulator Query Tools SPARQL Demo iSPARQL QBE</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>The title of this post is a &quot;Tongue in cheek&quot; expression of euphoria now that I have <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-idfa63488">FOAF</a> and <a href="http://dbpedia.org/resource/SIOC" id="link-idfa976f0">SIOC</a> (pronounced SHOCK) based data spaces exposed via <a href="http://myopenlink.net/dataspace/person/kidehen" id="link-idfde41f8">my FOAF</a> and <a href="http://myopenlink.net/dataspace/kidehen" id="link-idfdca6c8">my SIOC</a> information resource (<a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id16d0b0d8">RDF</a> files) <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier" id="link-idfa97070">URI</a>s.</p>

<p>If you want to explore who I know, what I read, and what I&#39;ve tagged (amongst other things), all you have to do is:</p>

<ol>
<li>Beam a <a href="http://dbpedia.org/resource/SPARQL" id="link-idfdca878">SPARQL</a> query down my data space URIs which expose FOAF or SIOC based interconnected <a href="http://dbpedia.org/resource/Linked_Data" id="link-idfa954e8">Linked Data</a> graphs.</li>
<li>
Walkthrough using an RDF Browser until you reach a beachhead and then beam your SPARQL from there (remember you only need the URI of the RDF Data Source, and while in my Data Space every data item has a proper URI).</li>
</ol>

<p>Some Tools that help you comprehend what I am saying:</p>

<h2>Browsers</h2>
<ul>
<li>Zitgist Data Viewer (<a href="http://dataviewer.zitgist.com/?uri=http%3A//myopenlink.net/dataspace/kidehen" id="link-id16d410c0">SIOC</a> and <a href="http://dataviewer.zitgist.com/?uri=http%3A//myopenlink.net/dataspace/person/kidehen" id="link-idfa489e8">FOAF</a> data spaces)</li>
<li>OpenLink RDF Browser (<a href="http://demo.openlinksw.com/rdfbrowser/?uri=http%3A%2F%2Fmyopenlink.net%2Fdataspace%2Fkidehen" id="link-idfa8b0d8">SIOC</a> and <a href="http://demo.openlinksw.com/rdfbrowser/?uri=http%3A%2F%2Fmyopenlink.net%2Fdataspace%2Fperson%2Fkidehen" id="link-idfa974a8">FOAF</a> data spaces)</li>
<li>DISCO (<a href="http://www4.wiwiss.fu-berlin.de/rdf_browser/?browse_uri=http%3A%2F%2Fmyopenlink.net%2Fdataspace%2Fkidehen%2Fspace%23this" id="link-idfa62288">SIOC</a> and <a href="http://myopenlink.net/dataspace/person/kidehen#this" id="link-idf940338">FOAF</a> data spaces)</li>
<li><a href="http://dig.csail.mit.edu/2005/ajar/release/tabulator/0.8/tab.html" id="link-id16d6a4b8">Tabulator</a></li>
</ul>

<h2>Query Tools</h2>

<ul>
<li><a href="http://demo.openlinksw.com/sparql_demo" id="link-idfdd43b8">SPARQL Demo</a></li>
<li><a href="http://demo.openlinksw.com/isparql" id="link-idfa96bd0">iSPARQL QBE</a></li>
</ul>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-17#1300">
  <rss:title>Semantic Data Web Epiphanies: One Node at a Time</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-01-17T22:59:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">In 2006, I stumbled across Jason Kolb (online) via a 4-part series of posts titled: Reinventing the Internet. At the time, I realized that Jason was postulating about what is popularly known today as &quot;Data Portability&quot;, so I made contact with him (blogosphere style) via a post of my own titled: Data Spaces, Internet Reinvention, and the Semantic Web. Naturally, I tried to unveil to Jason the connection between his vision and the essence of the Semantic Web. Of course, he was skeptical :-) Jason recently moved to Massachusetts which led to me pinging him about our earlier blogosphere encounter and the emergence of a Data Portability Community. I also informed him about the fact that TimBL, myself, and a number of other Semantic Web technology enthusiasts, frequently meet on the 2nd Tuesday of each month at the MIT hosted Cambridge Semantic Web Gatherings, to discuss, demonstrate, debate all aspects of the Semantic Web. Luckily (for both of us), Jason attended the last event, and we got to meet each other in person. Following our face to face meeting in Cambridge, a number of follow-on conversations ensued covering Linked Data and practical applications of the Semantic Web vision. Jason writes about our exchanges in a recent post titled: The Semantic Web. His passion for Data Portability enabled me to use OpenID and FOAF integration to connect the Semantic Web and Data Portability via the Linked Data concept. During our conversations, Jason also alluded to the fact that he had already encountered OpenLink Software while working with our ODBC Drivers (part of our UDA product family) for IBM Informix (Single-Tier or Multi-Tier Editions) a few years ago (interesting random connection). As I&#39;ve stated in the past, I&#39;ve always felt that the Semantic Web vision will materialize by way of a global epiphany. The count down to this inevitable event started at the birth of the blogosphere, ironically. 
And accelerated more recently, through the emergence of Web 2.0 and Social Networking, even more ironically :-) The blogosphere started the process of Data Space coalescence via RSS/Atom based semi-structured data enclaves, Web 2.0 propagated Web Service usage en route to creating service provider controlled data and information silos, and Social Networking brought attention to the fact that User Generated Data wasn&#39;t actually owned or controlled by the Data Creators etc. The emergence of &quot;Data Portability&quot; has created a palatable moniker for a clearly defined, and slightly easier to understand, problem: the meshing of Data and Identity in cyberspace i.e. individual points of presence in cyberspace, in the form of &quot;Personal Data Spaces in the Clouds&quot; (think: doing really powerful stuff with .name domains). In a sense, this is the critical inflection point between the document centric &quot;Web of Linked Documents&quot; and the data centric &quot;Web of Linked Data&quot;. There is absolutely no other way to solve this problem in a manner that alleviates the imminent challenges presented by information overload -- resulting from the exponential growth of user generated data across the Internet and enterprise Intranets.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>In 2006, I stumbled across <a href="http://www.jasonkolb.com" id="link-id17165b98">Jason Kolb</a> (online) via a 4-part series of posts titled: <a href="http://www.jasonkolb.com/weblog/2006/08/reinventing_the_1.html" id="link-id14204cf8">Reinventing the Internet</a>. At the time, I realized that Jason was postulating about what is popularly known today as &quot;<a href="http://en.wikipedia.org/wiki/Data_portability" id="link-id1412b280">Data Portability</a>&quot;, so I made contact with him (blogosphere style) via a post of my own titled: <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/weblog/kidehen@openlinksw.com%27s%20BLOG%20%5B127%5D/1033" id="link-id13b1cb20">Data Spaces, Internet Reinvention, and the Semantic Web</a>. Naturally, I tried to unveil to Jason the connection between his vision and the essence of the <a href="http://dbpedia.org/resource/Semantic_Web" id="link-id143117f0">Semantic Web</a>. Of course, he was skeptical :-)</p>

<p>Jason recently moved to <a href="http://dbpedia.org/resource/Massachusetts" id="link-id13c4a470">Massachusetts</a> which led to me pinging him about our earlier blogosphere encounter and the emergence of a <a href="http://dataportability.org/" id="link-id17395c60">Data Portability Community</a>. I also informed him about the fact that <a href="http://dbpedia.org/resource/Tim_Berners-Lee" id="link-id105507f0">TimBL</a>, myself, and a number of other Semantic Web technology enthusiasts, frequently meet on the 2nd Tuesday of each month at the <a href="http://dbpedia.org/resource/Massachusetts_Institute_of_Technology" id="link-id1719f798">MIT</a> hosted <a href="http://esw.w3.org/topic/CambridgeSemanticWebGatherings" id="link-id1734d460">Cambridge Semantic Web Gatherings</a>, to discuss, demonstrate, debate all aspects of the Semantic Web. Luckily (for both of us), Jason attended the last event, and we got to meet each other in person.</p>

<p>Following our face to face meeting in Cambridge, a number of follow-on conversations ensued covering Linked Data and practical applications of the Semantic Web vision. Jason writes about our exchanges in a recent post titled: <a href="http://www.jasonkolb.com/weblog/2008/01/the-semantic-we.html" id="link-id13be6280">The Semantic Web</a>. His passion for Data Portability enabled me to use <a href="http://esw.w3.org/topic/FoafOpenid" id="link-id141516a8">OpenID and FOAF integration</a> to connect the Semantic Web and Data Portability via the Linked Data concept.</p>

<p>During our conversations, Jason also alluded to the fact that he had already encountered <a href="http://en.wikipedia.org/wiki/OpenLink_Software" id="link-id17038218">OpenLink Software</a> while working with our <a href="http://data.openlinksw.com/oplweb/product_category/odbc#this" id="link-id14325f08">ODBC Drivers</a> (part of our <a href="http://data.openlinksw.com/oplweb/product_family/uda#this" id="link-id11ab1008">UDA product family</a>) for <a href="http://dbpedia.org/resource/Informix" id="link-id125858d0">IBM Informix</a> (<a href="http://data.openlinksw.com/oplweb/product/odbc-informix-st#this" id="link-id13b85e30">Single-Tier</a> or <a href="http://data.openlinksw.com/oplweb/product/odbc-informix-mt#this" id="link-id13edceb0">Multi-Tier</a> Editions) a few years ago (interesting random connection).</p>

<p>As I&#39;ve stated in the past, I&#39;ve always felt that the Semantic Web vision will materialize by way of a global epiphany. The count down to this inevitable event started at the birth of the blogosphere, ironically. And accelerated more recently, through the emergence of <a href="http://dbpedia.org/resource/Web_2.0" id="link-id171d4ec8">Web 2.0</a> and <a href="http://dbpedia.org/page/Social_network" id="link-id140da830">Social Networking</a>, even more ironically :-)</p>

<p>The blogosphere started the process of Data Space coalescence via RSS/Atom based semi-structured data enclaves, Web 2.0 propagated Web Service usage en route to creating service provider controlled data and information silos, and Social Networking brought attention to the fact that User Generated Data wasn&#39;t actually owned or controlled by the Data Creators etc.</p>

<p>The emergence of &quot;Data Portability&quot; has created a palatable moniker for a clearly defined, and slightly easier to understand, problem: the meshing of Data and Identity in cyberspace i.e. individual points of presence in cyberspace, in the form of &quot;Personal Data Spaces in the Clouds&quot; (think: doing really powerful stuff with .name domains). In a sense, this is the critical inflection point between the document centric &quot;Web of Linked Documents&quot; and the data centric &quot;Web of Linked Data&quot;.  There is absolutely no other way to solve this problem in a manner that alleviates the imminent challenges presented by information overload -- resulting from the exponential growth of user generated data across the Internet and enterprise Intranets.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-15#1295">
  <rss:title>W3C&#39;s SPARQLing Data Access Ingenuity</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-01-15T22:58:53Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">The W3C officially unveiled the SPARQL Query Language today via a press release titled: W3C Opens Data on the Web with SPARQL. What is SPARQL? A query language for the burgeoning Structured &amp; Linked Data Web (aka Semantic Web / Giant Global Graph). Like SQL, for the Relational Data Model, it provides a query language for the Graph based RDF Data Model. It&#39;s also a REST or SOAP based Web Service that exposes SPARQL access to RDF Data via an endpoint. In addition, it&#39;s also a Query Results Serialization format that includes XML and JSON support. Why is it Important? It brings important clarity to the notion of the &quot;Web as a Database&quot; by transforming existing Web Sites, Portals, and Web Services into bona fide corpus of Mesh-able (rather than Mash-able) Data Sources. For instance, you can perform queries that join one or more of the aforementioned data sources in exactly the same manner (albeit different syntax) as you would one or more SQL Tables. Example: -- SPARQL equivalent of SQL SELECT * against my personal data space hosted FOAF file SELECT DISTINCT ?s ?p ?o FROM &lt;http://myopenlink.net/dataspace/person/kidehen&gt; WHERE {?s ?p ?o} -- SPARQL against my social network -- Note: My SPARQL will be beamed across all of contacts in the social networks of my contacts as long as they are all HTTP URI based within each data space PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt; SELECT DISTINCT ?Person FROM &lt;http://myopenlink.net/dataspace/person/kidehen&gt; WHERE {?s a foaf:Person; foaf:knows ?Person} Note: you can use the basic SPARQL Endpoint, SPARQL Query By Example, or SPARQL Query Builder Demo tool to experiment with the demonstration queries above. How Do I use It? SPARQL is implemented by RDF Data Management Systems (Triple or Quad Stores) just as SQL is implemented by Relational Database Management Systems. 
The aforementioned data management systems will typically expose SPARQL access via a SPARQL endpoint. Where are its implementations? A SPARQL implementors Testimonial page accompanies the SPARQL press release. In addition, there is a growing collection of implementations on the ESW Wiki Page for SPARQL compliant RDF Triple &amp; Quad Stores. Is this really a big deal? Yes! SPARQL facilitates an unobtrusive manifestation of a Linked Data Web by way of natural extension of the existing Document Web, i.e., these Web enclaves co-exist in symbiotic fashion. As DBpedia very clearly demonstrates, Linked Data makes the Semantic Web demonstrable and much easier to comprehend. Without SPARQL there would be no mechanism for Linked Data deployment, and without Linked Data there is no mechanism for Beaming Queries (directly or indirectly) across the Giant Global Graph of data hosted by Social Networks, Shared Bookmarks Services, Weblogs, Wikis, RSS/Atom/OPML feeds, Photo Galleries and other Web accessible Data Sources (Data Spaces). Related items Cool URIs Publishing Linked Data Tutorial Detailed SPARQL Query Examples using SIOC Data Spaces Detailed SPARQL Query Examples using FOAF Data Spaces</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>The W3C officially unveiled the SPARQL Query Language today via a press release titled: <a href="http://www.w3.org/2007/12/sparql-pressrelease" id="link-id10074ca8">W3C Opens Data on the Web with SPARQL</a>.</p>

<h2>What is <a href="http://dbpedia.org/resource/SPARQL" id="link-id10183f60">SPARQL</a>?</h2>
<p>A query language for the burgeoning Structured &amp; <a href="http://dbpedia.org/resource/Linked_Data" id="link-id10426b18">Linked Data</a> Web (aka <a href="http://dbpedia.org/resource/Semantic_Web" id="link-idffde090">Semantic Web</a> / <a href="http://en.wikipedia.org/wiki/Giant_Global_Graph" id="link-id103e3688">Giant Global Graph</a>). Like <a href="http://dbpedia.org/resource/SQL" id="link-id103365f8">SQL</a>, for the Relational Data Model, it provides a query language for the Graph based <a href="http://dbpedia.org/resource/RDF" id="link-id103e33e8">RDF</a> Data Model.</p>

<p>It&#39;s also a <a href="http://dbpedia.org/resource/Representational_State_Transfer" id="link-id1036a3d0">REST</a> or <a href="http://dbpedia.org/resource/SOAP" id="link-id103b36d8">SOAP</a> based Web Service that exposes SPARQL access to RDF Data via an endpoint.
</p>
<p>In addition, it&#39;s also a Query Results Serialization format that includes <a href="http://dbpedia.org/resource/XML" id="link-id1023bc60">XML</a> and <a href="http://dbpedia.org/resource/JSON" id="link-id102c3f88">JSON</a> support.</p>

<h2>Why is it Important?</h2>
<p>It brings important clarity to the notion of the &quot;Web as a Database&quot; by transforming existing Web Sites, Portals, and Web Services into bona fide corpus of Mesh-able (rather than Mash-able) Data Sources. For instance, you can perform queries that join one or more of the aforementioned data sources in exactly the same manner (albeit different syntax) as you would one or more SQL Tables. </p>

<h3>Example:</h3>
<p>-- SPARQL equivalent of SQL SELECT * against my personal data space hosted FOAF file</p>
<b><pre>
SELECT DISTINCT ?s ?p ?o
FROM &lt;http://myopenlink.net/dataspace/person/kidehen&gt; 
WHERE {?s ?p ?o}</pre></b>


<p>-- SPARQL against my social network
-- Note: My SPARQL will be beamed across all of contacts in the social networks of my contacts as long as they are all HTTP URI based within each data space</p>
<b><pre>PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt;
SELECT DISTINCT ?Person
FROM &lt;http://myopenlink.net/dataspace/person/kidehen&gt;
WHERE {?s a foaf:Person; foaf:knows ?Person}</pre></b>

<p>Note: you can use the basic <a href="http://demo.openlinksw.com/sparql" id="link-id1007d9b8">SPARQL Endpoint</a>, <a href="http://demo.openlinksw.com/isparql" id="link-id102c3e08">SPARQL Query By Example</a>, or <a href="http://demo.openlinksw.com/sparql_demo" id="link-id10201f98">SPARQL Query Builder Demo tool</a> to experiment with the demonstration queries above.</p>

<h2>How Do I use It?</h2>
<p>SPARQL is implemented by RDF Data Management Systems (Triple or Quad Stores) just as SQL is implemented by Relational Database Management Systems. The aforementioned data management systems will typically expose SPARQL access via a SPARQL endpoint.</p>

<h2>Where are its implementations?</h2>
<p>A SPARQL implementors Testimonial page accompanies the SPARQL press release. In addition, there is a growing collection of implementations on the <a href="http://esw.w3.org/topic/SparqlImplementations" id="link-id10066ca8">ESW Wiki Page for SPARQL compliant RDF Triple &amp; Quad Stores</a>.</p>

<h2>Is this really a big deal?</h2>

<p>Yes! SPARQL facilitates an <a href="http://virtuoso.openlinksw.com/presentations/Virtuoso_Sponger_1/Virtuoso_Sponger_1.html" id="link-id101ee5b0">unobtrusive manifestation of a Linked Data Web</a> by way of natural extension of the existing Document Web, i.e., these Web enclaves co-exist in symbiotic fashion. </p>

<p>As <a href="http://dbpedia.org" id="link-id1037edc0">DBpedia</a> very clearly demonstrates, Linked Data makes the Semantic Web demonstrable and much easier to comprehend. Without SPARQL there would be no mechanism for <a href="http://virtuoso.openlinksw.com/presentations/Virtuoso_Deploying_Linked_Data/Virtuoso_Deploying_Linked_Data.html" id="link-id10455da8">Linked Data deployment</a>, and without Linked Data there is no mechanism for Beaming Queries (directly or indirectly) across the Giant Global Graph of data hosted by Social Networks, Shared Bookmarks Services, Weblogs, Wikis, RSS/Atom/OPML feeds, Photo Galleries and other Web accessible Data Sources (Data Spaces).</p>

<h2>Related items</h2>
<ul>
<a href="http://www.w3.org/TR/cooluris/" id="link-id102021d8">Cool URIs</a>
</ul>
<ul>
<a href="http://sites.wiwiss.fu-berlin.de/suhl/bizer/pub/LinkedDataTutorial/" id="link-id1020d5c0">Publishing Linked Data Tutorial</a>
</ul>
<ul>
<a href="http://virtuoso.openlinksw.com/wiki/main/Main/ODSSIOCRef">Detailed SPARQL Query Examples using SIOC Data Spaces</a>
</ul>
<ul>
<a href="http://virtuoso.openlinksw.com/wiki/main/Main/ODSFOAFRef" id="link-id102c4608">Detailed SPARQL Query Examples using FOAF Data Spaces</a>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2008-01-04#1288">
  <rss:title>OpenOffice.org, SPARQL, and the Linked Data Web</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2008-01-05T02:50:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Question posed by Dan Brickley via a blog post: SQL, OpenOffice: would a JDBC driver for SPARQL protocol make sense? Writing a JDBC Driver for SPARQL is a little overkill. OpenOffice.org simply needs to make XML or Web Data (HTML, XHTML, and XML) bonafide data sources within its &quot;Pivot Table&quot; functionality realm. Then all that would then be required is a SPARQL SELECT Query transported via the SPARQL Protocol with results sent back using the SPARQL XML results serialization format (all part of a single SPARQL Protocol URL). Excel successfully consumes the following information resource URI: http://tinyurl.com/yvoccj (a tiny url for a SPARQL SELECT against my FOAF file). Alternatively, and currently achievable, you could simply use SPASQL (SPARQL within SQL) using a DBMS engine that supports SQL, SPARQL, and SPASQL e.g. Virtuoso. Virtuoso SPASQL support is exposed via its ODBC and/or JDBC Drivers. Thus you can do things such as: Use a SPARQL Query in the FROM CLAUSE of a SQL statement Execute SPARQL via SQL processor by prepending SPARQL query text with the literals &quot;sparql&quot; BTW - My New Year&#39;s Resolution: get my act together and shrink the ever increasing list of &quot;simple &amp; practical Virtuoso use case demos&quot; on my todo which now spans all the way back to 2006 :-(</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Question posed by Dan Brickley via a blog post: SQL, OpenOffice: <a href="http://danbri.org/words/2008/01/04/245" id="link-id1689abd8">would a JDBC driver for SPARQL protocol make sense?</a>
</p>


<p>Writing a <a href="http://dbpedia.org/resource/JDBC_driver" id="link-id16a96580">JDBC Driver</a> for <a href="http://dbpedia.org/resource/SPARQL" id="link-id1a908a70">SPARQL</a> is a little overkill. <a href="http://dbpedia.org/resource/OpenOffice.org" id="link-id16ae69a8">OpenOffice.org</a> simply needs to make <a href="http://dbpedia.org/resource/XML" id="link-id168d3880">XML</a> or Web Data (<a href="http://dbpedia.org/resource/HTML" id="link-id1a7f1f50">HTML</a>, <a href="http://dbpedia.org/resource/XHTML" id="link-id16c1ae60">XHTML</a>, and XML) bonafide data sources within its &quot;<a href="http://dbpedia.org/resource/Pivot_table" id="link-id16665398">Pivot Table</a>&quot; functionality realm.  Then all that would then be required is a <a href="http://www.w3.org/TR/rdf-sparql-query/#select" id="link-id168bcbe8">SPARQL SELECT Query</a> transported via the <a href="http://www.w3.org/TR/rdf-sparql-protocol/" id="link-id16c1bbc0">SPARQL Protocol</a> with results sent back using the <a href="http://www.w3.org/TR/rdf-sparql-XMLres/" id="link-id1aa61118">SPARQL XML results serialization</a> format (all part of a single SPARQL Protocol URL).</p>
<p>Excel successfully consumes the following information resource URI: http://tinyurl.com/yvoccj (a tiny url for a SPARQL SELECT against my<a href="http://myopenlink.net/dataspace/person/kidehen" id="link-id16702ba8"> FOAF file</a>).</p>

<p>Alternatively, and currently achievable, you could simply use <a href="http://esw.w3.org/topic/SPASQL" id="link-id1a1b6b78">SPASQL</a> (SPARQL within SQL) using a <a href="http://dbpedia.org/resource/DBMS" id="link-id1661f240">DBMS</a> engine that supports SQL, SPARQL, and SPASQL e.g. <a href="http://dbpedia.org/resource/Virtuoso_Universal_Server" id="link-id168bba60">Virtuoso</a>. </p>

<p>
<a href="http://docs.openlinksw.com/virtuoso/rdfapiandsql.html" id="link-id167d9508">Virtuoso SPASQL support</a> is exposed via its <a href="http://dbpedia.org/resource/Open_Database_Connectivity" id="link-id16c62160">ODBC</a> and/or JDBC Drivers.  Thus you can do things such as:
</p>
<ol>
<li>Use a SPARQL Query in the FROM CLAUSE of a <a href="http://dbpedia.org/resource/SQL" id="link-id1657a3a8">SQL</a> statement</li>
<li>Execute  SPARQL via SQL processor by prepending SPARQL query text with the literals &quot;sparql&quot; </li>
</ol>

<p>BTW - My New Year&#39;s Resolution: get my act together and shrink the ever increasing list of &quot;simple &amp; practical Virtuoso use case demos&quot; on my todo which now spans all the way back to 2006 :-(</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2007-11-21#1272">
  <rss:title>Social Web RDF Store Benchmark</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-11-21T13:07:05Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Social Web RDF Store Benchmark Elaborating on my previous post, as food for thought for an RDF store benchmarking activity under the W3C, I present the following rough sketch. At the end of the below, I propose some common business questions that should be answered by a social web aggregator. The problem with these is that it is not really possible to ask interesting questions over a large database without involving some sort of counting and grouping. I feel that we simply cannot make a representative benchmark without these, quite regardless of the fact that SPARQL in its present form does not have these features. Hence I have simply stated the questions and left any implementation open. If this seems like an interesting direction, the nascent W3C benchmarking XG (experimental group) can refine the business questions, relative query frequencies, exact data set composition, etc. Social Web RDF Benchmark by Orri Erling Goals This benchmark models the use of RDF for representing and analyzing use of social software by user communities. The benchmark consists of a scalable synthetic data set, a feed of updates to the data set, and a query mix. The data set reflects the common characteristics of the social web, with realistic distribution of connections, user contributed content, commenting, tagging, and other social web activities. The data set is expressed in the FOAF and SIOC vocabularies. The query mix is divided between relatively short, dashboard or search engine style lookups, and longer running analytics queries. The system being modeled is an aggregator of social web content; we could liken it to an RDF-based Technorati with some extra features. Users can publish their favorite queries or mesh-ups as logical views served by the system. In this manner, queries come to depend on other queries, somewhat like SQL VIEWs can reference each other. 
There is a small qualification data set that can be tested against the queries to validate that the system under test (SUT) produces the correct results. The benchmark is scaled by number of users. To facilitate comparison, some predefined scales are offered, i.e., 100K, 300K, 1M, 3M, 10M users. Each simulated user both produces and consumes content. The level of activity of users is unevenly divided. There are two work mixes — the browsing mix, which consists of a mix of lookups and contributing content, and the analytics mix, which consists of long-running queries for tracking the state of the network. For each 100 browsing mixes, one analytics mix is performed. A benchmark run is at least 1h real-time in duration. The metric is calculated by the number of browsing mixes completed during the test window. This simulates 10% of the users being online at any one time, thus for a scale of 1M users, 100K browsing mixes will be simultaneously proceeding. The test driver submits the work via HTTP. What load balancing or degree of parallel serving of the requests is used is left up to the SUT. The metric is expressed as queries per second, taking the total number of queries executed by completed browsing mixes and dividing this by the real time of the measurement window. The metric is called qpsSW, for queries per second, socialweb. The cost metric is $/qpsSW, calculated by the costing rules of the TPC. If compute-on-demand infrastructure is used, the costing will be $/qpsSW/day. The test sponsor is the party contributing the result. The contribution consists of the metric and of a full disclosure report (FDR), written following a template given in the benchmark specification. The disclosure requirements follow the TPC practices, including publishing any configuration scripts, data definition language statements, timing for warm-up and test window, times for individual queries etc. All details of the hardware and software are disclosed. 
Test Support Software The software consists of the data generator and of a test driver. The test driver calls functions supplied by the test sponsor for performing the diverse operations in the test. Source code for any modifications of the test driver is to be published as part of the FDR. Rules for SUT Any hardware/software combination — including single machines, clusters, clusters rented from computer providers like Amazon EC2 — is eligible. The SUT must produce correct answers for the validation queries against the validation data set. The implementation of the queries is not restricted. These can be any SPARQL or other queries, application server based logic, stored procedures or other, in any language, provided full source code is provided in the FDR. The data set is provided as serialized RDF. The means of storage are left up to the SUT. The basic intention is to use a triple store of some form, but the specific indexing, use of property tables, materialized views, and so forth, is left up to the test sponsor. All tuning and configuration is to be published in the FDR. Simulated Workload For each operation of each mix, the specification shall present: The logical intent of the operation, the business question, e.g., What is the hot topic among my friends? The question or update expressed in terms of the data in the data set. Sample text of a query answering the question or pseudo-code for deriving the answer. Result set layout, if applicable. The relative frequencies of the queries are given in the query mix summary. Browsing Mix The browsing mix consists of the following operations: Updates Make a blog post. Make a blog comment. Make a new social contact. For one new social contact, there are 10 posts and 20 comments. Queries What are the 10 most recent posts by somebody in my friends or their friends? This would be a typical dashboard item. What are the authoritative bloggers on topic x? This is a moderately complex ad-hoc query. 
Take posts tagged with the topic, count links to them, take the blogs containing them, show the 10 most cited blogs with the most recent posts with the tag. This would be typical of a stored query, like a parameterizable report. How do I contact person x? Calculate the chain of common acquaintances best for reaching person x. For practicality, we do not do a full walk of anything but just take the distinct persons in 2 steps of the user and in 2 steps of x and see the intersection. Who are the people like me? Find the top 10 people ranked by count of tags in common in the person&#39;s tag cloud. The tag cloud is the set of interests and the set of tags in blog posts of the person. Who react to or talk about me? Count of replies to material by the user, grouped by the commenting user and the site of the comment, top 20, sorted by count descending. Who are my fans that I do not know? Same as above, excluding people within 2 steps. Who are my competitors? Most prolific posters on topics of my interest that do not cite me. Where is the action? On forums where I participate, what are the top 5 threads, as measured by posts in the last day. Show count of posts in the last day and the day before that. How do I get there? Who are the people active around both topic x and y? This is defined by a person having participated during the last year in forums of x as well as of y. Forums are tagged by topics. The most active users are first. The ranking is proportional to the sum of the number of posts in x and y. Analytic Mix These queries are typical questions about the state of the conversation space as a whole and can for example be published as a weekly summary page. The fastest propagating idea - What is the topic with the most users who have joined in the last day? A user is considered to have joined if the user was not discussing this in the past 10 days. Prime movers - What users start conversations? A conversation is the set of material in reply to or citing a post. 
The reply distance can be arbitrarily long, the citing distance is a direct link to the original post or a reply there to. The number and extent of conversations contribute towards the score. Geography - Over the last 10 days, for each geographic area, show the top 50 tags. The location is the location of the poster. Social hubs - For each community, get the top 5 people who are central to it in terms of number of links to other members of the same community and in terms of being linked from posts. A community is the set of forums that have a specific topic.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<div>
<div style="display:none;">Social Web RDF Store Benchmark</div>
<p>Elaborating on <a href="http://www.openlinksw.com/dataspace/oerling/weblog/Orri%20Erling%27s%20Blog/1269" id="link-idfe9e1d8">my previous post</a>, as food for thought for an <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x1df25610">RDF</a> store benchmarking activity under the W3C, I present the following rough sketch. At the end of the below, I propose some common business questions that should be answered by a social web aggregator.</p>
<p>The problem with these is that it is not really possible to ask interesting questions over a large database without involving some sort of counting and grouping. I feel that we simply cannot make a representative benchmark without these, quite regardless of the fact that <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x1e92c5e8">SPARQL</a> in its present form does not have these features. Hence I have simply stated the questions and left any implementation open. If this seems like an interesting direction, the nascent W3C benchmarking XG (experimental group) can refine the business questions, relative query frequencies, exact <a href="http://dbpedia.org/resource/Data" id="link-id0x1ae090c8">data</a> set composition, etc.</p>
<h3>Social Web RDF Benchmark </h3>
<p>
<i>by Orri Erling</i>
</p>
<h4>Goals</h4>
<p>This benchmark models the use of RDF for representing and analyzing use of social software by user communities. The benchmark consists of a scalable synthetic data set, a feed of updates to the data set, and a query mix. The data set reflects the common characteristics of the social web, with realistic distribution of connections, user contributed content, commenting, tagging, and other social web activities. The data set is expressed in the FOAF and SIOC vocabularies. The query mix is divided between relatively short, dashboard or search engine style lookups, and longer running analytics queries.</p>
<p>The system being modeled is an aggregator of social web content; we could liken it to an RDF-based Technorati with some extra features.</p>
<p>Users can publish their favorite queries or mesh-ups as logical views served by the system. In this manner, queries come to depend on other queries, somewhat like <a href="http://dbpedia.org/resource/SQL" id="link-id0x1e537ea0">SQL</a> VIEWs can reference each other.</p>
<p>There is a small qualification data set that can be tested against the queries to validate that the system under test (SUT) produces the correct results.</p>
<p>The benchmark is scaled by number of users. To facilitate comparison, some predefined scales are offered, i.e., 100K, 300K, 1M, 3M, 10M users. Each simulated user both produces and consumes content. The level of activity of users is unevenly divided.</p>
<p>There are two work mixes — the browsing mix, which consists of a mix of lookups and contributing content, and the analytics mix, which consists of long-running queries for tracking the state of the network. For each 100 browsing mixes, one analytics mix is performed.</p>
<p>A benchmark run is at least 1h real-time in duration. The metric is calculated by the number of browsing mixes completed during the test window. This simulates 10% of the users being online at any one time, thus for a scale of 1M users, 100K browsing mixes will be simultaneously proceeding.</p>
<p>The test driver submits the work via <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0x1dee3f48">HTTP</a>. What load balancing or degree of parallel serving of the requests is used is left up to the SUT.</p>
<p>The metric is expressed as queries per second, taking the total number of queries executed by completed browsing mixes and dividing this by the real time of the measurement window. The metric is called qpsSW, for <i>queries per second, socialweb</i>. The cost metric is $/qpsSW, calculated by the costing rules of the TPC. If compute-on-demand infrastructure is used, the costing will be $/qpsSW/day.</p>
<p>The test sponsor is the party contributing the result. The contribution consists of the metric and of a full disclosure report (FDR), written following a template given in the benchmark specification. The disclosure requirements follow the TPC practices, including publishing any configuration scripts, data definition language statements, timing for warm-up and test window, times for individual queries etc. All details of the hardware and software are disclosed.</p>
<h4>Test Support Software</h4>
<p>The software consists of the data generator and of a test driver. The test driver calls functions supplied by the test sponsor for performing the diverse operations in the test. Source code for any modifications of the test driver is to be published as part of the FDR.</p>
<h4>Rules for SUT</h4>
<p>Any hardware/software combination — including single machines, clusters, clusters rented from computer providers like Amazon EC2 — is eligible.</p>
<p>The SUT must produce correct answers for the validation queries against the validation data set.</p>
<p>The implementation of the queries is not restricted. These can be any SPARQL or other queries, <a href="http://dbpedia.org/resource/Application_server" id="link-id0x98ff800">application server</a> based logic, stored procedures or other, in any language, provided full source code is provided in the FDR.</p>
<p>The data set is provided as serialized RDF. The means of storage are left up to the SUT. The basic intention is to use a triple store of some form, but the specific indexing, use of property tables, materialized views, and so forth, is left up to the test sponsor. All tuning and configuration is to be published in the FDR.</p>
<h4>Simulated Workload</h4>
<p>For each operation of each mix, the specification shall present:</p>
<ol>
 <li>
  <p>The logical intent of the operation, the business question, e.g., <i>What is the hot topic among my friends?</i>
  </p>
</li>
<li>
  <p>The question or update expressed in terms of the data in the data set.</p>
</li>
<li>
  <p>Sample text of a query answering the question or pseudo-code for deriving the answer.</p>
</li>
<li>
  <p>Result set layout, if applicable.</p>
</li>
</ol>
<p>The relative frequencies of the queries are given in the query mix summary.</p>
<h4>Browsing Mix</h4>
<p>The browsing mix consists of the following operations:</p>
<h5>Updates</h5>
<p></p>
<ul>
<li>
  <p>Make a <a href="http://dbpedia.org/resource/Blog" id="link-id0xbeb19e0">blog</a> post.</p>
</li>
<li>
  <p>Make a blog comment.</p>
</li>
<li>
  <p>Make a new social contact.</p>
</li>
</ul>
<p>For one new social contact, there are 10 posts and 20 comments.</p>
<h5>Queries</h5>
<ul>
 <li>
  <p>
    <i>What are the 10 most recent posts by somebody in my friends or their friends?</i> This would be a typical dashboard item.</p>
 </li>
<li>
  <p>
    <i>What are the authoritative bloggers on topic x?</i> This is a moderately complex ad-hoc query. Take posts tagged with the topic, count links to them, take the blogs containing them, show the 10 most cited blogs with the most recent posts with the <a href="http://dbpedia.org/resource/Tag" id="link-id0xb7b85f8">tag</a>. This would be typical of a stored query, like a parameterizable report.</p>
</li>
<li>
  <p>
    <i>How do I contact person x?</i> Calculate the chain of common acquaintances best for reaching person x. For practicality, we do not do a full walk of anything but just take the distinct persons in 2 steps of the user and in 2 steps of x and see the intersection.</p>
</li>
<li>
  <p>
    <i>Who are the people like me?</i> Find the top 10 people ranked by count of tags in common in the person&#39;s tag cloud. The tag cloud is the set of interests and the set of tags in blog posts of the person.</p>
</li>
<li>
  <p>
    <i>Who react to or talk about me?</i> Count of replies to material by the user, grouped by the commenting user and the site of the comment, top 20, sorted by count descending.</p>
</li>
<li>
  <p>
    <i>Who are my fans that I do not know?</i> Same as above, excluding people within 2 steps.</p>
</li>
<li>
  <p>
    <i>Who are my competitors?</i> Most prolific posters on topics of my interest that do not cite me.</p>
</li>
<li>
  <p>
    <i>Where is the action?</i> On forums where I participate, what are the top 5 threads, as measured by posts in the last day. Show count of posts in the last day and the day before that.</p>
</li>
<li>
  <p>
    <i>How do I get there? Who are the people active around both topic x and y?</i> This is defined by a person having participated during the last year in forums of x as well as of y. Forums are tagged by topics. The most active users are first. The ranking is proportional to the sum of the number of posts in x and y.</p>
</li>
</ul>
<h4>Analytic Mix</h4>
<p>These queries are typical questions about the state of the conversation space as a whole and can for example be published as a weekly summary page.</p>
<ul>
<li>
  <p>
    <b>The fastest propagating idea</b> - <i>What is the topic with the most users who have joined in the last day?</i> A user is considered to have joined if the user was not discussing this in the past 10 days.</p>
</li>
<li>
  <p>
    <b>Prime movers</b> - <i>What users start conversations?</i> A conversation is the set of material in reply to or citing a post. The reply distance can be arbitrarily long, the citing distance is a direct link to the original post or a reply thereto. The number and extent of conversations contribute towards the score.</p>
</li>
<li>
  <p>
    <b>Geography</b> - Over the last 10 days, for each geographic area, show the top 50 tags. The location is the location of the poster.</p>
</li>
<li>
  <p>
    <b>Social hubs</b> - For each community, get the top 5 people who are central to it in terms of number of links to other members of the same community and in terms of being linked from posts. A community is the set of forums that have a specific topic.</p>
</li>
</ul>
</div>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2007-11-08#1269">
  <rss:title>Social Web RDF Store Benchmark</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-11-08T13:39:39Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Elaborating on my previous post, as food for thought for an RDF store benchmarking activity under the W3C, I present the following rough sketch. At the end of the below, I propose some common business questions that should be answered by a social web aggregator. The problem with these is that it is not really possible to ask interesting questions over a large database without involving some sort of counting and grouping. I feel that we simply cannot make a representative benchmark without these, quite regardless of the fact that SPARQL in its present form does not have these features. Hence I have simply stated the questions and left any implementation open. If this seems like an interesting direction, the nascent W3C benchmarking XG (experimental group) can refine the business questions, relative query frequencies, exact data set composition, etc. Social Web RDF Benchmark by Orri Erling Goals This benchmark models the use of RDF for representing and analyzing use of social software by user communities. The benchmark consists of a scalable synthetic data set, a feed of updates to the data set, and a query mix. The data set reflects the common characteristics of the social web, with realistic distribution of connections, user contributed content, commenting, tagging, and other social web activities. The data set is expressed in the FOAF and SIOC vocabularies. The query mix is divided between relatively short, dashboard or search engine style lookups, and longer running analytics queries. The system being modeled is an aggregator of social web content; we could liken it to an RDF-based Technorati with some extra features. Users can publish their favorite queries or mesh-ups as logical views served by the system. In this manner, queries come to depend on other queries, somewhat like SQL VIEWs can reference each other. 
There is a small qualification data set that can be tested against the queries to validate that the system under test (SUT) produces the correct results. The benchmark is scaled by number of users. To facilitate comparison, some predefined scales are offered, i.e., 100K, 300K, 1M, 3M, 10M users. Each simulated user both produces and consumes content. The level of activity of users is unevenly divided. There are two work mixes â the browsing mix, which consists of a mix of lookups and contributing content, and the analytics mix, which consists of long-running queries for tracking the state of the network. For each 100 browsing mixes, one analytics mix is performed. A benchmark run is at least 1h real-time in duration. The metric is calculated by the number of browsing mixes completed during the test window. This simulates 10% of the users being online at any one time, thus for a scale of 1M users, 100K browsing mixes will be simultaneously proceeding. The test driver submits the work via HTTP. What load balancing or degree of parallel serving of the requests is used is left up to the SUT. The metric is expressed as queries per second, taking the total number of queries executed by completed browsing mixes and dividing this by the real time of the measurement window. The metric is called qpsSW, for queries per second, socialweb. The cost metric is $/qpsSW, calculated by the costing rules of the TPC. If compute-on-demand infrastructure is used, the costing will be $/qpsSW/day. The test sponsor is the party contributing the result. The contribution consists of the metric and of a full disclosure report (FDR), written following a template given in the benchmark specification. The disclosure requirements follow the TPC practices, including publishing any configuration scripts, data definition language statements, timing for warm-up and test window, times for individual queries etc. All details of the hardware and software are disclosed. 
Test Support Software The software consists of the data generator and of a test driver. The test driver calls functions supplied by the test sponsor for performing the diverse operations in the test. Source code for any modifications of the test driver is to be published as part of the FDR. Rules for SUT Any hardware/software combination â including single machines, clusters, clusters rented from computer providers like Amazon EC2 â is eligible. The SUT must produce correct answers for the validation queries against the validation data set. The implementation of the queries is not restricted. These can be any SPARQL or other queries, application server based logic, stored procedures or other, in any language, provided full source code is provided in the FDR. The data set is provided as serialized RDF. The means of storage are left up to the SUT. The basic intention is to use a triple store of some form, but the specific indexing, use of property tables, materialized views, and so forth, is left up to the test sponsor. All tuning and configuration is to be published in the FDR. Simulated Workload For each operation of each mix, the specification shall present: The logical intent of the operation, the business question, e.g., What is the hot topic among my friends? The question or update expressed in terms of the data in the data set. Sample text of a query answering the question or pseudo-code for deriving the answer. Result set layout, if applicable. The relative frequencies of the queries are given in the query mix summary. Browsing Mix The browsing mix consists of the following operations: Updates Make a blog post. Make a blog comment. Make a new social contact. For one new social contact, there are 10 posts and 20 comments. Queries What are the 10 most recent posts by somebody in my friends or their friends? This would be a typical dashboard item. What are the authoritative bloggers on topic x? This is a moderately complex ad-hoc query. 
Take posts tagged with the topic, count links to them, take the blogs containing them, show the 10 most cited blogs with the most recent posts with the tag. This would be typical of a stored query, like a parameterizable report. How do I contact person x? Calculate the chain of common acquaintances best for reaching person x. For practicality, we do not do a full walk of anything but just take the distinct persons in 2 steps of the user and in 2 steps of x and see the intersection. Who are the people like me? Find the top 10 people ranked by count of tags in common in the person&#39;s tag cloud. The tag cloud is the set of interests and the set of tags in blog posts of the person. Who react to or talk about me? Count of replies to material by the user, grouped by the commenting user and the site of the comment, top 20, sorted by count descending. Who are my fans that I do not know? Same as above, excluding people within 2 steps. Who are my competitors? Most prolific posters on topics of my interest that do not cite me. Where is the action? On forums where I participate, what are the top 5 threads, as measured by posts in the last day. Show count of posts in the last day and the day before that. How do I get there? Who are the people active around both topic x and y? This is defined by a person having participated during the last year in forums of x as well as of y. Forums are tagged by topics. The most active users are first. The ranking is proportional to the sum of the number of posts in x and y. Analytic Mix These queries are typical questions about the state of the conversation space as a whole and can for example be published as a weekly summary page. The fastest propagating idea - What is the topic with the most users who have joined in the last day? A user is considered to have joined if the user was not discussing this in the past 10 days. Prime movers - What users start conversations? A conversation is the set of material in reply to or citing a post. 
The reply distance can be arbitrarily long, the citing distance is a direct link to the original post or a reply there to. The number and extent of conversations contribute towards the score. Geography - Over the last 10 days, for each geographic area, show the top 50 tags. The location is the location of the poster. Social hubs - For each community, get the top 5 people who are central to it in terms of number of links to other members of the same community and in terms of being linked from posts. A community is the set of forums that have a specific topic.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Elaborating on <a href="http://www.openlinksw.com/dataspace/oerling/weblog/Orri%20Erling%27s%20Blog/1269" id="link-idfe9e1d8">my previous post</a>, as food for thought for an <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x1d1e1468">RDF</a> store benchmarking activity under the W3C, I present the following rough sketch. At the end of the below, I propose some common business questions that should be answered by a social web aggregator.</p>
<p>The problem with these is that it is not really possible to ask interesting questions over a large database without involving some sort of counting and grouping. I feel that we simply cannot make a representative benchmark without these, quite regardless of the fact that <a href="http://dbpedia.org/resource/SPARQL" id="link-id0xba84830">SPARQL</a> in its present form does not have these features. Hence I have simply stated the questions and left any implementation open. If this seems like an interesting direction, the nascent W3C benchmarking XG (experimental group) can refine the business questions, relative query frequencies, exact <a href="http://dbpedia.org/resource/Data" id="link-id0x1c272b10">data</a> set composition, etc.</p>
<h3>Social Web RDF Benchmark </h3>
<p>
<i>by Orri Erling</i>
</p>
<h4>Goals</h4>
<p>This benchmark models the use of RDF for representing and analyzing use of social software by user communities. The benchmark consists of a scalable synthetic data set, a feed of updates to the data set, and a query mix. The data set reflects the common characteristics of the social web, with realistic distribution of connections, user contributed content, commenting, tagging, and other social web activities. The data set is expressed in the FOAF and SIOC vocabularies. The query mix is divided between relatively short, dashboard or search engine style lookups, and longer running analytics queries.</p>
<p>The system being modeled is an aggregator of social web content; we could liken it to an RDF-based Technorati with some extra features.</p>
<p>Users can publish their favorite queries or mesh-ups as logical views served by the system. In this manner, queries come to depend on other queries, somewhat like <a href="http://dbpedia.org/resource/SQL" id="link-id0xb75c930">SQL</a> VIEWs can reference each other.</p>
<p>There is a small qualification data set that can be tested against the queries to validate that the system under test (SUT) produces the correct results.</p>
<p>The benchmark is scaled by number of users. To facilitate comparison, some predefined scales are offered, i.e., 100K, 300K, 1M, 3M, 10M users. Each simulated user both produces and consumes content. The level of activity of users is unevenly divided.</p>
<p>There are two work mixes &#8212; the browsing mix, which consists of a mix of lookups and contributing content, and the analytics mix, which consists of long-running queries for tracking the state of the network. For each 100 browsing mixes, one analytics mix is performed.</p>
<p>A benchmark run is at least 1h real-time in duration. The metric is calculated by the number of browsing mixes completed during the test window. This simulates 10% of the users being online at any one time, thus for a scale of 1M users, 100K browsing mixes will be simultaneously proceeding.</p>
<p>The test driver submits the work via <a href="http://dbpedia.org/resource/Hypertext_Transfer_Protocol" id="link-id0x1ae7c010">HTTP</a>. What load balancing or degree of parallel serving of the requests is used is left up to the SUT.</p>
<p>The metric is expressed as queries per second, taking the total number of queries executed by completed browsing mixes and dividing this by the real time of the measurement window. The metric is called qpsSW, for <i>queries per second, socialweb</i>. The cost metric is $/qpsSW, calculated by the costing rules of the TPC. If compute-on-demand infrastructure is used, the costing will be $/qpsSW/day.</p>
<p>The test sponsor is the party contributing the result. The contribution consists of the metric and of a full disclosure report (FDR), written following a template given in the benchmark specification. The disclosure requirements follow the TPC practices, including publishing any configuration scripts, data definition language statements, timing for warm-up and test window, times for individual queries etc. All details of the hardware and software are disclosed.</p>
<h4>Test Support Software</h4>
<p>The software consists of the data generator and of a test driver. The test driver calls functions supplied by the test sponsor for performing the diverse operations in the test. Source code for any modifications of the test driver is to be published as part of the FDR.</p>
<h4>Rules for SUT</h4>
<p>Any hardware/software combination &#8212; including single machines, clusters, clusters rented from computer providers like Amazon EC2 &#8212; is eligible.</p>
<p>The SUT must produce correct answers for the validation queries against the validation data set.</p>
<p>The implementation of the queries is not restricted. These can be any SPARQL or other queries, <a href="http://dbpedia.org/resource/Application_server" id="link-id0x1a38aee0">application server</a> based logic, stored procedures or other, in any language, provided full source code is provided in the FDR.</p>
<p>The data set is provided as serialized RDF. The means of storage are left up to the SUT. The basic intention is to use a triple store of some form, but the specific indexing, use of property tables, materialized views, and so forth, is left up to the test sponsor. All tuning and configuration is to be published in the FDR.</p>
<h4>Simulated Workload</h4>
<p>For each operation of each mix, the specification shall present:</p>
<ol>
 <li>
  <p>The logical intent of the operation, the business question, e.g., <i>What is the hot topic among my friends?</i>
  </p>
</li>
<li>
  <p>The question or update expressed in terms of the data in the data set.</p>
</li>
<li>
  <p>Sample text of a query answering the question or pseudo-code for deriving the answer.</p>
</li>
<li>
  <p>Result set layout, if applicable.</p>
</li>
</ol>
<p>The relative frequencies of the queries are given in the query mix summary.</p>
<h4>Browsing Mix</h4>
<p>The browsing mix consists of the following operations:</p>
<h5>Updates</h5>
<p></p>
<ul>
<li>
  <p>Make a <a href="http://dbpedia.org/resource/Blog" id="link-id0x1e0f6470">blog</a> post.</p>
</li>
<li>
  <p>Make a blog comment.</p>
</li>
<li>
  <p>Make a new social contact.</p>
</li>
</ul>
<p>For one new social contact, there are 10 posts and 20 comments.</p>
<h5>Queries</h5>
<ul>
 <li>
  <p>
    <i>What are the 10 most recent posts by somebody in my friends or their friends?</i> This would be a typical dashboard item.</p>
 </li>
<li>
  <p>
    <i>What are the authoritative bloggers on topic x?</i> This is a moderately complex ad-hoc query. Take posts tagged with the topic, count links to them, take the blogs containing them, show the 10 most cited blogs with the most recent posts with the <a href="http://dbpedia.org/resource/Tag" id="link-id0xbf5ace8">tag</a>. This would be typical of a stored query, like a parameterizable report.</p>
</li>
<li>
  <p>
    <i>How do I contact person x?</i> Calculate the chain of common acquaintances best for reaching person x. For practicality, we do not do a full walk of anything but just take the distinct persons in 2 steps of the user and in 2 steps of x and see the intersection.</p>
</li>
<li>
  <p>
    <i>Who are the people like me?</i> Find the top 10 people ranked by count of tags in common in the person&#39;s tag cloud. The tag cloud is the set of interests and the set of tags in blog posts of the person.</p>
</li>
<li>
  <p>
    <i>Who reacts to or talks about me?</i> Count of replies to material by the user, grouped by the commenting user and the site of the comment, top 20, sorted by count descending.</p>
</li>
<li>
  <p>
    <i>Who are my fans that I do not know?</i> Same as above, excluding people within 2 steps.</p>
</li>
<li>
  <p>
    <i>Who are my competitors?</i> Most prolific posters on topics of my interest that do not cite me.</p>
</li>
<li>
  <p>
    <i>Where is the action?</i> On forums where I participate, what are the top 5 threads, as measured by posts in the last day. Show count of posts in the last day and the day before that.</p>
</li>
<li>
  <p>
    <i>How do I get there? Who are the people active around both topic x and y?</i> This is defined by a person having participated during the last year in forums of x as well as of y. Forums are tagged by topics. The most active users are first. The ranking is proportional to the sum of the number of posts in x and y.</p>
</li>
</ul>
<h4>Analytic Mix</h4>
<p>These queries are typical questions about the state of the conversation space as a whole and can for example be published as a weekly summary page.</p>
<ul>
<li>
  <p>
    <b>The fastest propagating idea</b> - <i>What is the topic with the most users who have joined in the last day?</i> A user is considered to have joined if the user was not discussing this in the past 10 days.</p>
</li>
<li>
  <p>
    <b>Prime movers</b> - <i>What users start conversations?</i> A conversation is the set of material in reply to or citing a post. The reply distance can be arbitrarily long; the citing distance is a direct link to the original post or a reply thereto. The number and extent of conversations contribute towards the score.</p>
</li>
<li>
  <p>
    <b>Geography</b> - Over the last 10 days, for each geographic area, show the top 50 tags. The location is the location of the poster.</p>
</li>
<li>
  <p>
    <b>Social hubs</b> - For each community, get the top 5 people who are central to it in terms of number of links to other members of the same community and in terms of being linked from posts. A community is the set of forums that have a specific topic.</p>
</li>
</ul>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-09-22#1261">
  <rss:title>Fourth Platform: Data Spaces in The Cloud (Update)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-09-22T23:43:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I&#39;ve written extensively on the subject of Data Spaces in relation to the Data Web for while. I&#39;ve also written sparingly about OpenLink Data Spaces (a Data Web Platform that build using Virtuoso). On the other hand, I haven&#39;t shed much light on installation and deployment of OpenLink Data Spaces. Jon Udell recently penned a post titled: The Fourth Platform. The post arrives at a spookily coincidental time (this happens quite often between Jon and I as demonstrated last year during our podcast; the &quot;Fourth&quot; in his Innovators Podcast series). The platform that Jon describes is &quot;Cloud Based&quot; and comprised of Storage and Computation. I would like to add Data Access and Management (native and virtual) under the fourth platform banner with the end product called: &quot;Cloud based Data Spaces&quot;. As I write, we are releasing a Virtuoso AMI (Amazon Image) labeled: virtuoso-dataspace-server. This edition of Virtuoso includes the OpenLink Data Spaces Layer and all of the OAT applications we&#39;ve been developing for a while. What Benefits Does this offer? Personal Data Spaces in the Cloud - a place where you can control and consolidate data across your Blogs, Wikis, RSS/Atom Feed Subscriptions, Shared Bookmarks, Shared Calendars, Discussion Threads, Photo Galleries etc All the data in your Data Space is SPARQL or GData accessible. All of the data in your Personal Data Space is Linked Data from the get go. Each Item of data is URI addressable SIOC support - your Blogs, Wikis, Bookmarks etc.. are based on the SIOC ontology for Semantically Interlinking Online Communities (think: Open social-graph++) FOAF support - your FOAF Profile page provides a URI that is an in-road to all Data in your Data Space. OpenID support - your Personal Data Space ID is usable wherever OpenID is supported. 
OpenID and FOAF are integrated as per latest FOAF specs Two Integration with Facebook - You can access your Data Space from Facebook or access Facebook from your Data Space Unified Storage - The WebDAV based filesystem provides Cloud Storage that&#39;s integrated with Amazon S3; It also exposes all of your Data Space data via a traditional filesystem UI (think virtual Spotlight); You can also mount this drive to your local filesystem via your native operating system&#39;s WebDAV support SyncML - you can sync calendar and contact details with your Data Space in the cloud from your Mobile phone. A practical Semantic Data Web solution - based on Web Infrastructure and doesn&#39;t require you to do anything beyond exposing URIs for data in your Data Spaces. EC2-AMI Details: AMI ID: ami-e2ca2f8b Manifest file: virtuoso-images/virtuoso-dataspace-server.manifest.xml Installation Guide: Get an Amazon Web Services (AWS) account Signup for S3 and EC2 services Install the EC2 plugin for Firefox Start the EC2 plugin Locate the row containingÂ ami-7c31d515Â Â ManifestÂ virtuoso-test/virtuoso-cloud-beta-9-i386.manifest.xmlÂ (sort using the AMI ID or Manifest Columns or search on pattern: virtuoso, due to name flux) Start the Virtuoso Data Space Server AMI Wait 4-5 minutes (*take a few minutes to create the pre-configured Linux Image*) Connect to http://http://your-ec2-instance-cname:8890/ Log in with user/password dba/dba Go to the Admin UI (Virtuoso Conductor) and change the PWDs for the &#39;dba&#39; and &#39;dav&#39; accounts (*Important!*) Give the &quot;SPARQL&quot; user &quot;SPARQL_UPDATE&quot; privileges (required if you want to exploit the in-built Sponger Middleware) Click on the ODS (OpenLink Data Spaces) link to start an Personal Editon of OpenLink Data Spaces (or go to: http://your-ec2-instance-cname/dataspace/ods/index.html) Log-in using the username and password credentials for the &#39;dav&#39; account (or register a new user note: OpenID is an option here also) 
Create an Data Space Application Instance by clicking on a Data Space App. Tab Import data from your existing Web 2.0 style applications into OpenLink Data Spaces e.g. subscribe to a few RSS/Atom feeds via the &quot;Feeds Manager&quot; application or import some Bookmarks using the &quot;Bookmarks&quot; application Then look at the imported data in Linked Data form via your ODS generated URIs based on the patterns: http://your-ec2-instance-cname/dataspace/person/your-ods-id#this (URI for You the Person), http://your-ec2-instance-cname/dataspace/person/your-ods-id (FOAF File URI), http://your-ec2-instance-cname/dataspace/your-ods-id (SIOC File URI) (OAT) from your Data Space instanceInstall the OAT VAD package via the Admin UI and then apply the URI patterns below within your browser: http://:8890/oatdemo - Entire OAT Demo Collection http://:8890/rdfbrowser - RDF Browser http://:8890/isparql - SPARQL Query Builder (iSPARQL) http://:8890/qbe - SQL Query Builder (iSQL) http://:8890/formdesigner - Forms Builder (for building Meshups based on RDF, SQL, or Web Servives Data Souces) http://:8890/dbdesigner - SQL DB Schema Designer (note a Visual SQL-RDF Mapper is also on it&#39;s way http://:8890/DAV/JS/ - To view the OAT Tree (there are some experimental demos that are missing from the main demo app etc..) There&#39;s more to come!</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<p>I&#39;ve written extensively on the subject of <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&amp;q=data%20spaces&amp;type=text&amp;output=html" id="link-id134c2280">Data Spaces</a> in relation to the <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&amp;q=data%20web%0D%0A&amp;type=text&amp;output=html" id="link-id105aef90">Data Web</a> for while. I&#39;ve also written sparingly about <a href="http://virtuoso.openlinksw.com/wiki/main/Main/OdsIndex" id="link-id105bd100">OpenLink Data Spaces</a> (a Data Web Platform that build using Virtuoso). On the other hand, I haven&#39;t shed much light on installation and deployment of OpenLink Data Spaces.</p> <p> <a href="http://blog.jonudell.net" id="link-id14347f20">Jon Udell</a> recently penned a post titled: <a href="http://blog.jonudell.net/2007/09/21/the-fourth-platform/" id="link-id1439ed48">The Fourth Platform</a>. The post arrives at a spookily coincidental time (this happens quite often between Jon and I as demonstrated last year during our <a href="http://weblog.infoworld.com/udell/gems/ju_idehen.mp3" id="link-id107d17a8">podcast</a>; the &quot;Fourth&quot; in his Innovators Podcast series).</p> <p>The platform that Jon describes is &quot;Cloud Based&quot; and comprised of Storage and Computation. I would like to add Data Access and Management (native and virtual) under the fourth platform banner with the end product called: &quot;Cloud based Data Spaces&quot;. </p> <p>As I write, we are releasing a Virtuoso AMI (Amazon Image) labeled: virtuoso-dataspace-server. 
This edition of<a href="http://virtuoso.openlinksw.com" id="link-id13543210"> Virtuoso</a> includes the OpenLink Data Spaces Layer and all of the OAT applications we&#39;ve been developing for a while.</p> <h2>What Benefits Does this offer?</h2> <ol> <li>Personal Data Spaces in the Cloud - a place where you can control and consolidate data across your Blogs, Wikis, RSS/Atom Feed Subscriptions, Shared Bookmarks, Shared Calendars, Discussion Threads, Photo Galleries etc</li> <li>All the data in your Data <a href="http://en.wikipedia.org/wiki/Data_Spaces">Space</a> is <a href="http://dbpedia.org/resource/SPARQL" id="link-id1149a4f8">SPARQL</a> or <a href="http://dbpedia.org/resource/GData" id="link-id107a9f28">GData</a> accessible.</li> <li>All of the data in your Personal Data Space is <a href="http://dbpedia.org/resource/Linked_Data">Linked Data</a> from the get go. Each Item of data is <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier">URI</a> addressable</li> <li> <a href="http://dbpedia.org/resource/SIOC" id="link-id104f4160">SIOC</a> support - your Blogs, Wikis, Bookmarks etc.. are based on the SIOC ontology for Semantically Interlinking Online Communities (think: Open social-graph++) </li> <li> <a href="http://dbpedia.org/resource/Friend_of_a_friend" id="link-id105beb78">FOAF</a> support - your FOAF Profile page provides a URI that is an in-road to all Data in your Data Space.</li> <li> <a href="http://dbpedia.org/resource/OpenID" id="link-id1144e138">OpenID</a> support - your Personal Data Space ID is usable wherever OpenID is supported. 
OpenID and FOAF are integrated as per latest FOAF specs</li> <li>Two Integration with Facebook - You can access your Data Space from Facebook or access Facebook from your Data Space</li> <li>Unified Storage - The WebDAV based filesystem provides Cloud Storage that&#39;s integrated with Amazon S3; It also exposes all of your Data Space data via a traditional filesystem UI (think virtual Spotlight); You can also mount this drive to your local filesystem via your native operating system&#39;s WebDAV support</li> <li> <a href="http://dbpedia.org/resource/SyncML" id="link-id11128f48">SyncML</a> - you can sync calendar and contact details with your Data Space in the cloud from your Mobile phone.</li> <li>A practical Semantic Data Web solution - based on Web Infrastructure and doesn&#39;t require you to do anything beyond exposing URIs for data in your Data Spaces.</li> </ol> <h2> <a href="http://dbpedia.org/resource/Amazon_Elastic_Compute_Cloud" id="link-id115d1920">EC2</a>-AMI Details:</h2> <ul>AMI ID: ami-e2ca2f8b</ul> <ul>Manifest file: virtuoso-images/virtuoso-dataspace-server.manifest.xml</ul> <h2>Installation Guide:</h2> <ol> <li>Get an Amazon Web Services (AWS) account</li> <li>Signup for S3 and EC2 services</li> <li>Install the EC2 plugin for Firefox</li> <li>Start the EC2 plugin</li> <li>Locate the row containingÂ <b>ami-7c31d515Â Â ManifestÂ virtuoso-test/virtuoso-cloud-beta-9-i386.manifest.xmlÂ </b>(sort using the AMI ID or Manifest Columns or search on pattern: virtuoso, due to name flux)</li> <li>Start the Virtuoso Data Space Server AMI</li> <li>Wait 4-5 minutes (*take a few minutes to create the pre-configured Linux Image*)</li> <li>Connect to http://<public_dns_name_of_your_instance>http://your-ec2-instance-cname:8890/ Log in with user/password dba/dba</public_dns_name_of_your_instance> </li> <li>Go to the Admin UI (Virtuoso Conductor) and change the PWDs for the &#39;dba&#39; and &#39;dav&#39; accounts (*Important!*)</li> <li>Give the &quot;SPARQL&quot; 
user &quot;SPARQL_UPDATE&quot; privileges (required if you want to exploit the in-built Sponger Middleware)</li> <li>Click on the <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces">ODS</a> (OpenLink Data Spaces) link to start an Personal Editon of OpenLink Data Spaces (or go to: http://your-ec2-instance-cname/dataspace/ods/index.html)</li> <li>Log-in using the username and password credentials for the &#39;dav&#39; account (or register a new user note: OpenID is an option here also) Create an Data Space Application Instance by clicking on a Data Space App. Tab</li> <li>Import data from your existing Web 2.0 style applications into OpenLink Data Spaces e.g. subscribe to a few RSS/Atom feeds via the &quot;Feeds Manager&quot; application or import some Bookmarks using the &quot;Bookmarks&quot; application</li> <li>Then look at the imported data in Linked Data form via your ODS generated URIs based on the patterns: http://your-ec2-instance-cname/dataspace/person/your-ods-id#this (URI for You the Person), http://your-ec2-instance-cname/dataspace/person/your-ods-id (FOAF File URI), http://your-ec2-instance-cname/dataspace/your-ods-id (SIOC File URI)<br /> </li> </ol> <h2> (OAT) from your Data Space instance</h2>Install the OAT VAD package via the Admin UI and then apply the URI patterns below within your browser:<br /> <ol> <li>http://<public_dns_name_of_your_instance>:8890/oatdemo - Entire OAT Demo Collection</public_dns_name_of_your_instance> </li> <li>http://<public_dns_name_of_your_instance>:8890/rdfbrowser - RDF Browser</public_dns_name_of_your_instance> </li> <li>http://<public_dns_name_of_your_instance>:8890/isparql - SPARQL Query Builder (iSPARQL)</public_dns_name_of_your_instance> </li> <li>http://<public_dns_name_of_your_instance>:8890/qbe - SQL Query Builder (iSQL)</public_dns_name_of_your_instance> </li> <li>http://<public_dns_name_of_your_instance>:8890/formdesigner - Forms Builder (for building Meshups based on RDF, SQL, or Web Servives Data 
Souces)</public_dns_name_of_your_instance> </li> <li>http://<public_dns_name_of_your_instance>:8890/dbdesigner - SQL DB Schema Designer (note a Visual SQL-RDF Mapper is also on it&#39;s way</public_dns_name_of_your_instance> </li> <li>http://<public_dns_name_of_your_instance>:8890/DAV/JS/ - To view the OAT Tree (there are some experimental demos that are missing from the main demo app etc..) </public_dns_name_of_your_instance> </li> </ol> <p>There&#39;s more to come!</p>

]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-09-03#1249">
  <rss:title>Yet Another RDFa Demo</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-09-03T17:59:02Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Ivan Herman just posted another nice example of practical RDFa usage in a blog post titled: Yet Another RDFa Proccessor. In his post, Ivan exposes a URI for his FOAF-in-RDFa file. Since I am aggressively tracking RDFa developments, I decided to quickly view Ivan&#39;s FOAF-in-RDFa file via the OpenLink RDF Browser. The full implications are best understood when you click on each of the Browser&#39;s Tabs -- each providing a different perspective on this interesting addition to the Semantic Data Web (note: the Fresnel Tab which demonstrates declarative UI templating using N3). What&#39;s Going on Here? The OpenLink RDF Browser is a Rich Internet Application built using OAT (OpenLink Ajax Toolkit). In my case, I am deploying the RDF Browser from a Virtuoso instance, which implies that the Browser is able to use the Virtuoso Sponger Middleware (exposed as a REST Service at the Virtuoso instance endpoint: /proxy); which includes an RDFa Cartridge comprised of a metadata extractor and an RDF Schema / OWL Ontology mapper. That&#39;s it!</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://www.ivan-herman.net/Ivan_Herman">Ivan Herman</a> just posted another nice example of practical <a href="http://dbpedia.org/resource/RDFa">RDFa</a> usage in a blog post titled: <a href="http://ivanherman.wordpress.com/2007/09/03/yet-another-rdfa-processor…/">Yet Another RDFa Processor</a>. In his post, Ivan exposes a <a href="http://dbpedia.org/resource/Uniform_Resource_Identifier">URI</a> for his <a href="http://www.ivan-herman.net/foaf.html">FOAF-in-RDFa file</a>.</p>

<p>Since I am <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/weblog/kidehen@openlinksw.com%27s%20BLOG%20%5B127%5D/1243">aggressively tracking RDFa developments</a>, I decided to quickly view <a href="http://demo.openlinksw.com/DAV/home/demo/Public/Queries/DataWeb/ivan_herman_foaf_via_rdfa.wqx">Ivan&#39;s FOAF-in-RDFa file via the OpenLink RDF Browser</a>. The full implications are best understood when you click on each of the Browser&#39;s Tabs -- each providing a different perspective on this interesting addition to the Semantic Data Web (note: the <a href="http://www.w3.org/2005/04/fresnel-info/">Fresnel</a> Tab which demonstrates declarative UI templating using N3).</p>

<h3>What&#39;s Going on Here?</h3>
<p>The <a href="http://demo.openlinksw.com/DAV/JS/rdfbrowser/index.html">OpenLink RDF Browser</a> is a <a href="http://en.wikipedia.org/wiki/Rich_internet_application">Rich Internet Application</a> built using OAT (<a href="http://oat.openlinksw.com">OpenLink Ajax Toolkit</a>). In my case, I am deploying the RDF Browser from a <a href="http://virtuoso.openlinksw.com">Virtuoso</a> instance, which implies that the Browser is able to use the <a href="http://www.openlinksw.com/blog/~kidehen/?id=1172">Virtuoso Sponger</a> Middleware (exposed as a REST Service at the Virtuoso instance endpoint: /proxy); which includes an RDFa Cartridge comprised of a metadata extractor and an <a href="http://dbpedia.org/resource/RDF_Schema">RDF Schema</a> / <a href="http://dbpedia.org/resource/Web_Ontology_Language">OWL Ontology</a> mapper. That&#39;s it!</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-06-14#1224">
  <rss:title>Enterprise 0.0, Linked Data, and Semantic Data Web</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-06-14T15:28:26Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Last week we officially released Virtuoso 5.0.1 (in Commercial and Open Source Editions). The press release provided us with an official mechanism and timestamp for the current Virtuoso feature set. A vital component of the new Virtuoso release is the finalization of our SQL to RDF mapping functionality -- enabling the declarative mapping of SQL Data to RDF. Additional technical insight covering other new features (delivered and pending) is provided by Orri Erling, as part of a series of post-Banff posts. Why is SQL to RDF Mapping a Big Deal? A majority of the world&#39;s data (especially in the enterprise realm) resides in SQL Databases. In addition, Open Access to the data residing in said databases remains the biggest challenge to enterprises for the following reasons: SQL Data Sources are inherently heterogeneous because they are acquired with business applications that are in many cases inextricably bound to a particular DBMS engine Data is predictably dirty DBMS vendors ultimately hold the data captive and have traditionally resisted data access standards such as ODBC (*trust me they have, just look at the unprecedented bad press associated with ODBC the only truly platform independent data access API. Then look at how this bad press arose..*) Enterprises have known from the beginning of modern corporate times that data access, discovery, and manipulation capabilities are inextricably linked to the &quot;Real-time Enterprise&quot; nirvana (hence my use of 0.0 before this becomes 3.0). 
In my experience, as someone whose operated in the data access and data integration realms since the late &#39;80s, I&#39;ve painfully observed enterprises pursue, but unsuccessfully attain, full control over enterprise data (the prized asset of any organization) such that data-, information-, knowledge-workers are just a click away from commencing coherent platform and database independent data drill-downs and/or discovery that transcend intranet, internet, and extranet boundaries -- serendipitous interaction with relevant data, without compromise! Okay, situation analysis done, we move on.. At our most recent (12th June) monthly Semantic Web Gathering, I unveiled to TimBL and a host of other attendees a simple, but powerful, demonstration of how Linked Data, as an aspect of the Semantic Data Web, can be applied to enterprise data integration challenges. Actual SQL to RDF Mapping Demo / Experiment Hypothesis A SQL Schema can be effectively mapped declaratively to RDF such that SQL Rows morph into RDF Instance Data (Entity Sets) based on the Concepts &amp; Properties defined in a Concrete Conceptual Data Model oriented Data Dictionary (RDF Schema and/or OWL Ontology). In addition, the solution must demonstrate how &quot;Linked Data in the Web&quot; is completely different from &quot;Data on the Web&quot; or &quot;Linked Data on the Web&quot; (btw - Tom Heath eloquently unleashed this point in his recent podcast interview with Talis). Apparatus An Ontology - in this case we simply derived the Northwind Ontology from the XML Schema based CSDL (Conceptual Schema Definition Language) used by Microsoft&#39;s public Astoria demo (specifically the Northwind Data Services demo). SQL Database Schema - Northwind (comes bundled with ACCESS, SQL Server, and Virtuoso) comprised of tables such as: Customer, Employee, Product, Category, Supplier, Shipper etc. 
OpenLink Virtuoso - SQL DBMS Engine (although this could have been any ODBC or JDBC accessible Database), SQL-RDF Metaschema Language, HTTP URL-rewriter, WebDAV Engine, and DBMS hosted XSLT processor Client Tools - iSPARQL Query Builder, RDF Browser (which could also have been Tabulator or DISCO or a standard Web Browser) Experiment / Demo Declaratively map the Northwind SQL Schema to RDF using the Virtuoso Meta Schema Language (see: Virtuoso PL based Northwind_SQL_RDF script) Start browsing the data by clicking on the URIs that represent the RDF Data Model Entities resulting from the SQL to RDF Mapping Observations Via a single Data Link click I was able to obtain specific information about the Customer represented by the URI &quot;ALFKI&quot; (act of URI Dereferencing as you would an Object ID in an Object or Object-Relational Database) Via a Dynamic Data Page I was able to explore all the entity relationships or specific entity data (i.e Exploratory or Entity specific dereferencing) in the Northwind Data Space I was able to perform similar exploration (as per item 2) using our OpenLink Browser. Conclusions The vision of data, information, or knowledge at your fingertips is nigh! Thanks to the infrastructure provided by the Semantic Data Web (URIs, RDF Data Model, variety of RDF Serialization Formats[1][2][3], and Shared Data Dictionaries / Schemas / Ontologies [1][2][3][4][5]) it&#39;s now possible to Virtualize enterprise data from the Physical Storage Level, through the Logical Data Management Levels (Relational), up to a Concrete Conceptual Model (Graph) without operating system, development environment or framework, or database engine lock-in. Next Steps We produce a shared ontology for the CRM and Business Reporting Domains. I hope this experiment clarifies how this is quite achievable by converting XML Schemas to RDF Data Dictionaries (RDF Schemas or Ontologies). 
Stay tuned :-) Also watch TimBL amplify and articulate Linked Data value in a recent interview. Other Related Matters To deliver a mechanism that facilitates the crystallization of this reality is a contribution of boundless magnitude (as we shall all see in due course). Thus, it is easy to understand why even &quot;her majesty&quot;, the queen of England, simply had to get in on the act and appoint TimBL to the &quot;British Order of Merit&quot; :-) Note: All of the demos above now work with IE &amp; Safari (a &quot;remember what Virtuoso is epiphany&quot;) by simply putting Virtuoso&#39;s DBMS hosted XSLT engine to use :-) This also applies to my earlier collection of demos from the Hello Data Web and other Data Web &amp; Linked Data related demo style posts.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Last week we <a href="http://www.openlinksw.com/press/virt_501.htm">officially released Virtuoso 5.0.1</a> (in Commercial and Open Source Editions). The press release provided us with an official mechanism and timestamp for the current Virtuoso feature set.</p> 
<p>A vital component of the new Virtuoso release is the finalization of our SQL to RDF mapping functionality -- enabling the declarative mapping of SQL Data to RDF. Additional technical insight covering other new features (delivered and pending) is provided by <a href="http://www.openlinksw.com/weblogs/oerling/">Orri Erling</a>, as part of a series of post-Banff posts.</p>

<h2>Why is SQL to RDF Mapping a Big Deal?</h2>

<p>A majority of the world&#39;s data (especially in the enterprise realm) resides in SQL Databases. In addition, Open Access to the data residing in said databases remains the biggest challenge to enterprises for the following reasons:</p>
<ol>
<li>
SQL Data Sources are inherently heterogeneous because they are acquired with business applications that are in many cases inextricably bound to a particular DBMS engine
</li>
<li>
Data is predictably dirty
</li>
<li>
DBMS vendors ultimately hold the data captive and have traditionally resisted data access standards such as ODBC (*trust me they have, just look at the unprecedented bad press associated with ODBC the only truly platform independent data access API. Then look at how this bad press arose..*)
</li>
</ol>

<p>
Enterprises have known from the beginning of modern corporate times that data access, discovery, and manipulation capabilities are inextricably linked to the &quot;Real-time Enterprise&quot; nirvana (hence my use of 0.0 before this becomes 3.0).</p>
<p>In my experience, as someone who&#39;s operated in the data access and data integration realms since the late &#39;80s, I&#39;ve painfully observed enterprises pursue, but unsuccessfully attain, full control over enterprise data (the prized asset of any organization) such that data-, information-, knowledge-workers are just a click away from commencing coherent platform and database independent data drill-downs and/or discovery that transcend intranet, internet, and extranet boundaries -- serendipitous interaction with relevant data, without compromise!</p>

<p>Okay, situation analysis done, we move on..  </p>

<p>At our most recent (<a href="http://esw.w3.org/topic/CambridgeSemanticWebGatherings/Meeting/2007-06-12_Gathering">12th June</a>) monthly <a href="http://esw.w3.org/topic/CambridgeSemanticWebGatherings">Semantic Web Gathering</a>, I unveiled to <a href="http://www.w3.org/People/Berners-Lee/card#i">TimBL</a> and a host of other attendees a simple, but powerful, demonstration of how <a href="http://en.wikipedia.org/wiki/Linked_Data">Linked Data</a>, as an aspect of the <a href="http://www.businessweek.com/technology/content/apr2007/tc20070409_961951.htm">Semantic Data Web</a>, can be applied to enterprise data integration challenges.</p>

<h2>Actual SQL to RDF Mapping Demo / Experiment</h2>

<h4>Hypothesis</h4>
A SQL Schema can be effectively mapped declaratively to RDF such that SQL Rows morph into RDF Instance Data (Entity Sets) based on the Concepts &amp; Properties defined in a Concrete Conceptual Data Model oriented Data Dictionary (<a href="http://www.w3schools.com/rdf/rdf_schema.asp">RDF Schema</a> and/or <a href="http://www.w3schools.com/rdf/rdf_owl.asp">OWL Ontology</a>). In addition, the solution must demonstrate how &quot;Linked Data in the Web&quot; is completely different from &quot;Data on the Web&quot; or &quot;Linked Data on the Web&quot; (btw - <a href="http://kasei.us/people/Tom_Heath/">Tom Heath</a> eloquently unleashed this point in his recent <a href="http://blogs.talis.com/nodalities/2007/06/tom_heath_talks_with_talis_abo.php">podcast interview with Talis</a>).

<h4>Apparatus</h4>
An Ontology - in this case we simply derived the <a href="http://demo.openlinksw.com/DAV/home/demo/Public/Queries/SQLRDFIntegraton/Explore_Northwind_Ontology.isparql">Northwind Ontology</a> from the XML Schema based CSDL (<a href="http://blogs.msdn.com/adonet/archive/2007/01/30/entity-data-model-part-1.aspx">Conceptual Schema Definition Language</a>) used by Microsoft&#39;s public <a href="http://astoria.mslivelabs.com/Default.aspx">Astoria demo</a> (specifically the <a href="http://astoria.mslivelabs.com/termsOfUseNorthwind.aspx?returnURL=Northwind">Northwind Data Services demo</a>).  

SQL Database Schema - <a href="http://www.microsoft.com/library/media/1033/technet/images/prodtechnol/sql/2000/maintain/sscpop07_big.gif">Northwind</a> (comes bundled with ACCESS, SQL Server, and Virtuoso) comprised of tables such as: <a href="http://www.openlinksw.com/schemas/northwind#Customer">Customer</a>, <a href="http://www.openlinksw.com/schemas/northwind#Employee">Employee</a>, <a href="http://www.openlinksw.com/schemas/northwind#Product">Product</a>, <a href="http://www.openlinksw.com/schemas/northwind#Category">Category</a>, <a href="http://www.openlinksw.com/schemas/northwind#Supplier">Supplier</a>, <a href="http://www.openlinksw.com/schemas/northwind#Shipper">Shipper</a> etc.

<a href="http://www.openlinksw.com/virtuoso/">OpenLink Virtuoso</a> - SQL DBMS Engine (although this could have been any <a href="http://en.wikipedia.org/wiki/Open_Database_Connectivity">ODBC</a> or <a href="http://en.wikipedia.org/wiki/Java_Database_Connectivity">JDBC</a> accessible Database), <a href="http://www.openlinksw.com/virtuoso/Whitepapers/pdf/Virtuoso_SQL_to_RDF_Mapping.pdf">SQL-RDF Metaschema Language</a>, HTTP URL-rewriter, WebDAV Engine, and DBMS hosted XSLT processor

Client Tools - <a href="http://demo.openlinksw.com/isparql/">iSPARQL Query Builder</a>, <a href="http://demo.openlinksw.com/DAV/JS/rdfbrowser/index.html">RDF Browser</a> (which could also have been <a href="http://www.w3.org/2005/ajar/tab">Tabulator</a> or <a href="http://sites.wiwiss.fu-berlin.de/suhl/bizer/ng4j/disco/">DISCO</a> or a standard Web Browser)

<h4>Experiment / Demo</h4>
<ol>
<li>
Declaratively map the Northwind SQL Schema to RDF using the Virtuoso Meta Schema Language (see: <a href="http://demo.openlinksw.com/DAV/home/demo/Public/Queries/SQLRDFIntegraton/northwind_sql_rdf.sql">Virtuoso PL based Northwind_SQL_RDF script</a>)
</li>
<li>
Start browsing the data by clicking on the URIs that represent the RDF Data Model Entities resulting from the SQL to RDF Mapping 
</li>
</ol>

<h4>Observations</h4>
<ol>
<li>
Via a single Data Link click I was able to obtain specific information about the Customer represented by the URI <a href="http://demo.openlinksw.com/Northwind/Customer/ALFKI">&quot;ALFKI&quot;</a> (act of URI Dereferencing as you would an Object ID in an Object or Object-Relational Database) </li>
<li>
Via a 
<a href="http://demo.openlinksw.com/DAV/home/demo/Public/Queries/SQLRDFIntegraton/Explore_Northwind.isparql">Dynamic Data Page </a> I was able to explore all the entity relationships or specific entity data (i.e Exploratory or Entity specific dereferencing) in the Northwind Data Space
</li>
<li>
I was able to perform similar exploration (as per item 2) using our
<a href="http://demo.openlinksw.com/DAV/home/demo/Public/Queries/SQLRDFIntegraton/Explore_Northwind_Customer_ALFKI.wqx">OpenLink Browser. </a>
</li>
</ol>

<h4>Conclusions</h4>
<p>The vision of data, information, or knowledge at your fingertips is nigh! Thanks to the infrastructure provided by the Semantic Data Web (URIs, <a href="http://en.wikipedia.org/wiki/Resource_Description_Framework">RDF Data Model</a>, variety of RDF Serialization Formats[<a href="http://www.dajobe.org/2004/01/turtle/">1</a>][<a href="http://www.w3.org/DesignIssues/Notation3">2</a>][<a href="http://www.w3.org/TR/2002/WD-rdf-syntax-grammar-20020325/">3</a>], and Shared Data Dictionaries / Schemas / Ontologies [<a href="http://xmlns.com/foaf/spec/">1</a>][<a href="http://rdfs.org/sioc/spec/">2</a>][<a href="http://www.w3.org/TR/swbp-skos-core-guide/">3</a>][<a href="http://musicontology.com/">4</a>][<a href="http://bblfish.net/work/atom-owl/2006-06-06/AtomOwl.html">5</a>]) it&#39;s now possible to Virtualize enterprise data from the Physical Storage Level, through the Logical Data Management Levels (Relational), up to a Concrete Conceptual Model (Graph) without operating system, development environment or framework, or database engine lock-in.</p>

<h2>Next Steps</h2>
<p>We produce a shared ontology for the CRM and Business Reporting Domains. I hope this experiment clarifies how this is quite achievable by converting XML Schemas to RDF Data Dictionaries (RDF Schemas or Ontologies). Stay tuned :-) 
</p>
<p>Also watch <a href="http://news.com.com/1606-2-6189377.html">TimBL amplify and articulate Linked Data value</a> in a recent interview.</p>

<h2>Other Related Matters</h2>
<p>To deliver a mechanism that facilitates the crystallization of this reality is a contribution of boundless magnitude (as we shall all see in due course). Thus, it is easy to understand why even &quot;her majesty&quot;, the queen of England, simply had to get in on the act and <a href="http://www.royal.gov.uk/output/Page1880.asp">appoint TimBL to the &quot;British Order of Merit&quot;</a> :-)</p>

<p>Note: All of the demos above now work with IE &amp; Safari (a &quot;remember what Virtuoso is epiphany&quot;) by simply putting Virtuoso&#39;s DBMS hosted XSLT engine to use :-) This also applies to my earlier collection of demos from the <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=hello%20data%20web&type=text&output=html">Hello Data Web</a> and other <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=.isparql&type=text&output=html">Data Web &amp; Linked Data related demo style posts</a>.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-05-25#1203">
  <rss:title>Shared Ontologies Linked Data Style!</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-05-25T21:12:36Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">As the Linked Data meme beams across the Web, it is important to note that Ontology / Schema sharing and reuse is critical to the overall vitality of the burgeoning Semantic Data Web. The items that follow attempt to demonstrate the point by way of SIOC (Semantically-Interlinked Online Communities Ontology) and MO (Music Ontology) domain exploration: Linked Data or Dynamic Data Web Pages: Music Ontology Overview SIOC Ontology Overview SIOC Type Ontology Module (how you extend SIOC Concepts unobtrusively) SIOC Services Ontology Module (how you extend SIOC in relation to Services Modeling). Semantic Web Browser Sessions: Music Ontology Overview via OpenLink RDF Browser SIOC Ontology Overview via OpenLink RDF Browser SIOC Type Ontology Module via OpenLink RDF Browser SIOC Services Ontology Module via OpenLink RDF Browser. Key point, if you are modeling People, Communities, Organizations, Documents, and other entities in the People, Organizations, Documents etc. Data Space, don&#39;t forget to : FOAF-FOAF-FOAF it Up! :-)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
 <p>As the <a href="http://blogs.talis.com/nodalities/2007/05/linked_data_the_real_semantic.php">Linked Data meme</a> beams across the Web, it is important to note that Ontology / Schema sharing and reuse is critical to the overall vitality of the burgeoning Semantic Data Web.</p>  <p>The items that follow attempt to demonstrate the point by way of SIOC (<a href="http://rdfs.org/sioc/spec/">Semantically-Interlinked Online Communities Ontology</a>) and MO (<a href="http://musicontology.com/">Music Ontology</a>) domain exploration:</p> <p> <b>Linked Data or Dynamic Data Web Pages</b>:</p> <ol> <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/linked_data_pages/music_ontology_overview.isparql">Music Ontology Overview</a> </li> <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/linked_data_pages/sioc_ontology_overview.isparql">SIOC Ontology Overview</a> </li> <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/linked_data_pages/sioc_types_ontology_module.isparql">SIOC Type Ontology Module</a> (how you extend SIOC Concepts unobtrusively)</li> <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/linked_data_pages/sioc_services_ontology_overview.isparql">SIOC Services Ontology Module</a> (how you extend SIOC in relation to Services Modeling).</li> </ol>  <p> <b>Semantic Web Browser Sessions</b>:</p> <ol> <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/browser_sessions/exploring_music_the_ontology.wqx">Music Ontology Overview via OpenLink RDF Browser</a> </li> <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/browser_sessions/exploring_sioc.wqx">SIOC Ontology Overview via OpenLink RDF Browser</a> </li>  <li>   <a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/browser_sessions/exploring_sioc_types_modules.wqx">SIOC Type Ontology Module </a>via OpenLink RDF Browser<br />  </li> <li>   <a 
href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/browser_sessions/exploring_sioc_services.wqx">SIOC Services Ontology Module </a>via OpenLink RDF Browser.</li> </ol>  <p>Key point, if you are modeling People, Communities, Organizations, Documents, and other entities in the People, Organizations, Documents etc. Data Space, don&#39;t forget to : FOAF-FOAF-FOAF it Up! :-)</p> 
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-05-25#1202">
  <rss:title>Exploring FOAF Linked Data Style!</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-05-25T15:41:35Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Over the last few hours the FOAF project received a wakeup call via Dan Brickley&#39;s FOAF 0.9 &quot;touch&quot; effort. Naturally, this triggered an obvious opportunity to demonstrate the prowess of Linked Data on the Semantic Web. What follows is a quick dump of what I sent to the foaf-dev mailing list: Here are a variety of FOAF Views built using: - OpenLink RDF Browser - Interactive SPARQL QBE - Raw SPARQL Endpoint Enabling you to explore the following lines: - FOAF Overview via a Linked Data Page (same as Dynamic Data Page) - FOAF Overview by Term Status via Linked Data Page - FOAF Overview SPARQL Query (.rq File) - FOAF Overview by Term Status - FOAF Overview via OpenLink RDF Browser</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Over the last few hours the FOAF project received a <a href="http://dannyayers.com/2007/05/25/foaf-0">wakeup call</a> via <a href="http://danbri.org/">Dan Brickley</a>&#39;s <a href="http://xmlns.com/foaf/spec">FOAF 0.9</a> &quot;touch&quot; effort.</p>

<p>Naturally, this triggered an obvious opportunity to demonstrate the prowess of <a href="http://en.wikipedia.org/wiki/Linked_Data">Linked Data</a> on the Semantic Web. What follows is a quick dump of what I sent to the <a href="http://lists.foaf-project.org/mailman/listinfo/foaf-dev">foaf-dev</a> mailing list:</p>

<p>Here are a variety of FOAF Views built using:</p>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/DAV/JS/rdfbrowser/index.html">OpenLink RDF Browser</a> 
</ul>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/isparql">Interactive SPARQL QBE </a>
</ul>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/sparql">Raw SPARQL Endpoint</a>
</ul>

<p>Enabling you to explore the following lines:</p>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/linked_data_pages/foaf_overview.isparql">FOAF Overview via a Linked Data Page</a> (same as Dynamic Data Page) 
</ul>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/linked_data_pages/foaf_overview_by_status.isparql">FOAF Overview by Term Status via Linked Data Page</a>
</ul>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/queries/foaf_overview.rq">FOAF Overview SPARQL Query (.rq File)</a>
</ul>
<ul>
-
<a href="http://dbpedia.openlinksw.com:8890/DAV/home/demo/dataweb/queries/foaf_overview_by_status.rq">FOAF Overview by Term Status</a>
</ul>
<ul>
-
<a href="http://tinyurl.com/2hpeau">FOAF Overview via OpenLink RDF Browser</a>
</ul>



]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-03-22#1165">
  <rss:title>Data Web, Googlebase, and Yahoo!</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-03-22T23:04:21Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">A defining characteristic of the Data Web (Context Oriented Web 3.0) is that it facilitates Meshups rather than Mashups. Quick Definitions: Mashups - Brute force joining of disparate Web Data Meshups - Natural joining of disparate Web Data Reasons for the distinction: Mashups are Data Model oblivious. Meshups are Data Model driven. Examples: Mashups are based on RSS 2.0 most of the time (RSS 2.0 is at best a Tree Structure that contains untyped or meaning challenged links). Meshups are RDF based and the data is self describing since the links are typed (possess inherent meaning thereby providing context). So what? You may be thinking. For starters, I can quite easily Mesh data from Googlebase (which emits RSS 2.0 or Atom) and other data sources with the Mapping Services from Yahoo! I can achieve this in minutes without writing a single line of code. I can do it because of the Data Model prowess of RDF (self-describing instance-data), the data interchange and transformation power of XML and XSLT respectively, the inherent power of XML based Web Services (REST or SOAP), and of course, having a Hybrid Server product like Virtuoso at my disposal that delivers a cross platform solution for exploiting all of these standards coherently. I can share the self-describing data source that serves my Meshup. Try reusing the data presented by a Mashup via the same URL that you used to locate the Mashup to get my drift. Demo Links: Googlebase Query URL as an RDF Data Source Perform a simple Data Mesh by adding (via link copy and paste) this Upcoming.org Query Services URL for Ajax Events to the RDF Browser&#39;s list of Data Sources (paste into the Data Source URI input field). What does this all mean? &quot;Context&quot; is the catalyst of the burgeoning Data Web (Semantic Web Layer - 1). 
It&#39;s the emerging appreciation of &quot;Context&quot; that is driving the growing desire to increment Web versions from 2.0 to 3.0. It is also the very same &quot;Context&quot; that has been a preoccupation of Semantic Web vision since its inception. The journey towards a more Semantic Web is all inclusive (all &quot;ANDs&quot; and no &quot;ORs&quot; re. participation). The Semantic Web is self-annotating. Web 2.0 has provided a huge contribution to the self annotation effort: on the Web we now have Data Spaces for Bookmarks (e.g del.icio.us), Image Galleries ( e.g Flickr), Discussion Forums (remember those comments associated with blog posts? ditto the pingbacks and trackbacks?), People Profiles (FOAF, XFN, del.icio.us, and those crumbling walled-gardens around many Social Networks), and more.. A Web without granular access to Data is simply not a Web worth having (think about the menace of click-fraud and spam).</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>A defining characteristic of the Data Web (Context Oriented Web 3.0) is that it facilitates Meshups rather than Mashups.</p>

<p>Quick Definitions:</p>
<ul>
Mashups - Brute force joining of disparate Web Data</ul>
<ul>
Meshups - Natural joining of disparate Web Data 
</ul>
<p>
Reasons for the distinction:</p>
<ul>Mashups are Data Model oblivious.</ul>
<ul>Meshups are Data Model driven.</ul>

<p>Examples:</p>

<ul>
Mashups are based on RSS 2.0 most of the time (RSS 2.0 is at best a Tree Structure that contains untyped or meaning challenged links).</ul>
<ul>
Meshups are RDF based and the data is self describing since the links are typed (possess inherent meaning thereby providing context).</ul>

<p>So what? You may be thinking.</p>
<p>For starters, I can quite easily Mesh data from Googlebase (which emits RSS 2.0 or Atom) and other data sources with the Mapping Services from Yahoo!</p>

<p>I can achieve this in minutes without writing a single line of code. I can do it because of the Data Model prowess of RDF (self-describing instance-data), the data interchange and transformation power of XML and XSLT respectively, the inherent power of XML based Web Services (REST or SOAP), and of course, having a Hybrid Server product like <a href="http://en.wikipedia.org/wiki/Virtuoso_Universal_Server">Virtuoso</a> at my disposal that delivers a cross platform solution for exploiting all of these standards coherently.</p>

<p>I can share the self-describing data source that serves my Meshup. Try reusing the data presented by a Mashup via the same URL that you used to locate the Mashup to get my drift.</p>

<p>Demo Links:</p>

<ol>
<li>
  <a href="http://demo.openlinksw.com/DAV/JS/rdfbrowser/index.html#http%3A%2F%2Fdemo.openlinksw.com%2FDAV%2Fhome%2Fdemo%2FPublic%2FQueries%2FDataWeb%2Fgoogle_base_jobs_dataspace.isparql">Googlebase Query URL as an RDF Data Source</a>
</li>
<li>Perform a simple Data Mesh by adding (via link copy and paste) this <a href="http://upcoming.org/search/?q=ajax&scope=allmetros&type=Events">Upcoming.org Query Services URL for Ajax Events</a> to the RDF Browser&#39;s list of Data Sources (paste into the Data Source URI input field).</li>
</ol>
<p>What does this all mean?</p>
<p>&quot;Context&quot; is the catalyst of the burgeoning Data Web (Semantic Web Layer - 1). It&#39;s the <a href="http://sramanamitra.com/blog/729">emerging appreciation of &quot;Context&quot;</a> that is driving the growing desire to increment Web versions from 2.0 to 3.0. It is also the very same &quot;Context&quot; that has been a preoccupation of <a href="http://www.w3.org/2001/sw/Activity">Semantic Web vision</a> since its inception.</p>
<p>The journey towards a more Semantic Web is all inclusive (all &quot;ANDs&quot; and no &quot;ORs&quot; re. participation).</p>
<p>The Semantic Web is <a href="http://www.openlinksw.com/blog/~kidehen/?id=887">self-annotating</a>. Web 2.0 has provided a huge contribution to the self annotation effort: on the Web we now have Data Spaces for Bookmarks (e.g del.icio.us), Image Galleries ( e.g Flickr), Discussion Forums (remember those comments associated with blog posts? ditto the pingbacks and trackbacks?), People Profiles (FOAF, XFN, del.icio.us, and those crumbling walled-gardens around many Social Networks), and more..</p>
<p>A Web without granular access to Data is simply not a Web worth having (think about the menace of click-fraud and spam).</p>

]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-03-09#1157">
  <rss:title>SPARQL and Full Text Indexing implementations are growing</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-03-09T23:50:29Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Virtuoso joins Boca and ARC 2.0 as RDF Quad or Triple Stores with Full Text Index extensions to SPARQL. Here is our example applied to DBpedia: PREFIX dbpedia: &lt;http://dbpedia.org/&gt; PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt; PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt; SELECT ?name ?birth ?death FROM &lt;http://dbpedia.org&gt; WHERE {    ?person dbpedia:birthplace &lt;http://dbpedia.org/resource/Berlin&gt; .    ?person dbpedia:birth ?birth .    ?person foaf:name ?name .    ?person dbpedia:death ?death    FILTER (?birth &lt; &quot;1900-01-01&quot;^^xsd:date and bif:contains (?name, &#39;otto&#39;)) . } ORDER BY ?name You can test further using our SPARQL Endpoint for DBpedia or via the DBPedia bound Interactive SPARQL Query Builder or just click *Here* for results courtesy of the SPARQL Protocol (REST based Web Service). Note: This is in-built functionality as Virtuoso has possessed Full Text Indexing since 1998-99. This capability applies to physical and virtual graphs managed by Virtuoso. As per usual, there is more to come as we now have a nice intersection point for SPARQL and XQuery/XPath since Triple Objects (the Literal variety) can take the form of XML Schema based Complex Types :-) A point I alluded to in my podcast interview with Jon Udell last year (*note: mechanical turk based transcript is bad*). The point I made went something like this: &quot;...you use SPARQL to traverse the typed links and then use XPath/XQuery for further granular access to the data if well-formed...&quot; Anyway, the podcast interview led to this InfoWorld article titled: Unified Data Theory.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
   <p> <a href="http://en.wikipedia.org/wiki/Virtuoso_Universal_Server">Virtuoso</a> joins <a href="http://wingerz.com/blog/2007/02/06/text-indexing-and-query-in-boca/">Boca</a> and <a href="http://seaborne.blogspot.com/2006/11/larq-lucene-arq.html">ARC 2.0</a> as RDF Quad or Triple Stores with Full Text Index extensions to SPARQL. Here is our example applied to <a href="http://dbpedia.org">DBpedia</a>:</p> <pre><font size="2">PREFIX dbpedia: &lt;http://dbpedia.org/&gt;
PREFIX foaf: &lt;http://xmlns.com/foaf/0.1/&gt;
PREFIX xsd: &lt;http://www.w3.org/2001/XMLSchema#&gt;
SELECT ?name ?birth ?death
FROM &lt;http://dbpedia.org&gt;
WHERE {
   ?person dbpedia:birthplace &lt;http://dbpedia.org/resource/Berlin&gt; .
   ?person dbpedia:birth ?birth .
   ?person foaf:name ?name .
   ?person dbpedia:death ?death
   FILTER (?birth &lt; &quot;1900-01-01&quot;^^xsd:date and bif:contains (?name,
&#39;otto&#39;)) .
}
ORDER BY ?name

</font></pre>  <p> You can test further using our <a href="http://demo3.openlinksw.com:8890/sparql/">SPARQL Endpoint for DBpedia</a> or via the <a href="http://demo3.openlinksw.com:8890/isparql/">DBPedia bound Interactive SPARQL Query Builder</a> or just click *<a href="http://demo3.openlinksw.com:8890/sparql/?default-graph-uri=&query=PREFIX+dbpedia%3A+%3Chttp%3A%2F%2Fdbpedia.org%2F%3E%0D%0APREFIX+foaf%3A+%3Chttp%3A%2F%2Fxmlns.com%2Ffoaf%2F0.1%2F%3E%0D%0APREFIX+xsd%3A+%3Chttp%3A%2F%2Fwww.w3.org%2F2001%2FXMLSchema%23%3E%0D%0ASELECT+%3Fname+%3Fbirth+%3Fdeath%0D%0AFROM+%3Chttp%3A%2F%2Fdbpedia.org%3E%0D%0AWHERE+%7B%0D%0A++++%3Fperson+dbpedia%3Abirthplace+%3Chttp%3A%2F%2Fdbpedia.org%2Fresource%2FBerlin%3E+.%0D%0A++++%3Fperson+dbpedia%3Abirth+%3Fbirth+.%0D%0A++++%3Fperson+foaf%3Aname+%3Fname+.%0D%0A++++%3Fperson+dbpedia%3Adeath+%3Fdeath%0D%0A++++FILTER+%28%3Fbirth+%3C+%221900-01-01%22%5E%5Exsd%3Adate+and+bif%3Acontains+%28%3Fname%2C+%27otto%27%29%29+.%0D%0A%7D%0D%0AORDER+BY+%3Fname&format=text%2Fhtml">Here</a>* for results courtesy of the <a href="http://www.w3.org/TR/rdf-sparql-protocol/">SPARQL Protocol</a> (REST based Web Service). </p> <p>Note: This is in-built functionality as Virtuoso has possessed <a href="http://en.wikipedia.org/wiki/Full_text_index">Full Text Indexing</a> since 1998-99. This capability applies to physical and virtual graphs managed by Virtuoso.</p> <p>As per usual, there is more to come as we now have a nice intersection point for SPARQL and XQuery/XPath since Triple Objects (the Literal variety) can take the form of XML Schema based Complex Types :-) A point I alluded to in my <a href="http://weblog.infoworld.com/udell/2006/04/28.html">podcast interview with Jon Udell </a>last year (*note: mechanical turk based transcript is bad*). 
The point I made went something like this: &quot;...you use SPARQL to traverse the typed links and then use XPath/XQuery for further granular access to the data if well-formed...&quot;</p> <p>Anyway, the podcast interview led to this InfoWorld article titled: <a href="http://www.infoworld.com/article/06/05/03/77873_19OPstrategic_1.html">Unified Data Theory</a>.<br /> </p>   
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-03-01#1148">
  <rss:title>Personal URIs &amp; Data Spaces</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-03-01T19:42:41Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Linking personal posted content across communities: &quot; With the help of Kingsley, Uldis and I have been looking at how SIOC can be used to link the content that a single person posts to a number of community sites. The picture below shows an example of stuff that I&#39;ve created on Flickr, YouTube, etc. through my various user identities on those sites (these match some SIOC types that we want to add to a separate module). We can also say that each Web 2.0 content item is a user-contributed post, with some attached or embedded content (e.g. a file or maybe just some metadata). This is part of a new discussion on the sioc-dev mailing list, and we&#39;d value your contributions. Edit: The inner layer is a person (semantically described in FOAF), the next layer is their user accounts (described in FOAF, SIOC) and the outer layer is the posted content - text, files, associated metadata - on community sites (again described using SIOC). No Tags&quot; (Via John Breslin - Cloudlands.) The point that John is making about the Data Web and Interlinked Data Spaces exposed via URIs (e.g Personal URIs), crystallizes a number of very important issues about the Data Web that may remain unclear. I am hoping that by digesting the post excerpt above, in conjunction with the items below, aids the pursuit of clarity and comprehension about the all important Data Web (Semantic Web - Layer 1): Your OpenID can be Your Personal URI (as noted by Henry Story&#39;s post about: The Many Uses of OpenID). That&#39;s what I have courtesy of OpenLink Data Spaces (ODS) The above only works unobtrusively (i.e. OpenID and Personal sharing a URI) if Content Negotiation is exploited on the Client and Server sides. TimBL&#39;s call out to Share Your Data and Link to Other Data via URIs via post titled: Give Yourself a URI. 
W3C&#39;s Best Practice Recipes for Publishing RDF Vocabularies W3C&#39;s Architecture of the World Wide Web - Vol 1 which covers URI Dereferencing (HTTP GET-ing the data that a URI points to) Richard Cyganiak&#39;s post titled: Debugging Semantic Web Sites with Curl. Examples of some of these principles in practice: Chris Bizer, Tobias Gaub, and Richard&#39;s Javascript based Semantic Web Client Library DISCO RDF Browser OpenLink Ajax Toolkit&#39;s (OAT) RDF Browser OpenLink Interactive SPARQL Query by Example (iSPARQL QBE) Dynamic Data Web Pages from my prior posts [1][2][3] dbpedia (Wikipedia as a Data Web oriented Data Source) And of course this blog post&#39;s permalink is a bona fide dereferencable URI. And of course there is more to come such as Grandma&#39;s Semantic Web Browser which is coming from Zitgist LLC (pronounced: Zeitgeist) a joint venture of OpenLink Software and Frederick Giasson.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<blockquote>
<p>
  <a href="http://www.johnbreslin.com/blog/2007/03/01/linking-personal-posted-content-across-communities/#comments">Linking personal posted content across communities</a>: &quot;</p>
<p>With the help of Kingsley, Uldis and I have been looking at how <a href="http://sioc-project.org/">SIOC</a> can be used to link the content that a single person posts to a number of community sites.  The picture below shows an example of stuff that I&#39;ve created on Flickr, YouTube, etc. through my various user identities on those sites (these match some <a href="http://wiki.sioc-project.org/index.php/TypesModule">SIOC types</a> that we want to add to a separate module).  We can also say that each Web 2.0 content item is a user-contributed post, with some attached or embedded content (e.g. a file or maybe just some metadata).  This is part of a new discussion on the <a href="http://groups.google.com/group/sioc-dev">sioc-dev</a> mailing list, and we&#39;d value your contributions.</p>
<p>
  <img id="image1178" src="http://www.johnbreslin.com/blog/wp-content/uploads/2007/03/20070228a.png" alt="20070228a.png" />
</p>
<p>Edit: The inner layer is a person (semantically described in FOAF), the next layer is their user accounts (described in FOAF, SIOC) and the outer layer is the posted content - text, files, associated metadata - on community sites (again described using SIOC).
</p>
No Tags&quot;

<p>(Via <a href="http://www.johnbreslin.com/blog">John Breslin - Cloudlands</a>.)</p>
</blockquote>
<p>The point that John is making about the Data Web and Interlinked <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q='data%20spaces'&type=text&output=html">Data Spaces</a> exposed via <a href="http://en.wikipedia.org/wiki/Uniform_Resource_Identifier">URI</a>s (e.g Personal URIs), crystallizes a number of very important issues about the Data Web that may remain unclear. I am hoping that by digesting the post excerpt above, in conjunction with the items below, aids the pursuit of clarity and comprehension about the all important Data Web (Semantic Web - Layer 1):</p>
<ol>
<li>
  <a href="http://kidehen.idehen.net/dataspace/kidehen">Your OpenID can be Your Personal URI</a> (as noted by <a href="http://blogs.sun.com/bblfish/">Henry Story</a>&#39;s post about: <a href="http://blogs.sun.com/bblfish/entry/openid_for_blogs_sun_com">The Many Uses of OpenID</a>). That&#39;s what I have courtesy of OpenLink Data Spaces (ODS)</li>
<li>The above only works unobtrusively (i.e. OpenID and Personal sharing a URI) if Content Negotiation is exploited on the Client and Server sides.</li>
<li>
  <a href="http://www.w3.org/People/Berners-Lee/card.rdf">TimBL</a>&#39;s call out to <a href="http://www.w3.org/DesignIssues/LinkedData.html">Share Your Data and Link to Other Data</a> via URIs via post titled: <a href="http://dig.csail.mit.edu/breadcrumbs/node/71">Give Yourself a URI</a>.</li>
<li>
  <a href="http://www.w3.org/TR/swbp-vocab-pub/">W3C&#39;s Best Practice Recipes for Publishing RDF Vocabularies</a>
</li>
<li>
  <a href="http://www.w3.org/TR/2004/REC-webarch-20041215/#dereference-uri">W3C&#39;s Architecture of the World Wide Web - Vol 1</a> which covers URI Dereferencing (HTTP GET-ing the data that a URI points to)</li>
<li>
  <a href="http://www4.wiwiss.fu-berlin.de/is-group/page/persons/Person6">Richard Cyganiak</a>&#39;s post titled: <a href="http://dowhatimean.net/2007/02/debugging-semantic-web-sites-with-curl">Debugging Semantic Web Sites with Curl</a>.</li>
</ol>
<p>Examples of some of these principles in practice:</p>
<ol>
<li>Chris Bizer, Tobias Gaub, and Richard&#39;s Javascript based <a href="http://sites.wiwiss.fu-berlin.de/suhl/bizer/ng4j/semwebclient/">Semantic Web Client Library</a>
</li>
<li>
  <a href="http://sites.wiwiss.fu-berlin.de/suhl/bizer/ng4j/disco/">DISCO RDF Browser</a>
</li>
<li>
  <a href="http://oat.openlinksw.com">OpenLink Ajax Toolkit</a>&#39;s (OAT) <a href="http://demo.openlinksw.com/DAV/JS/tests/rdfbrowser/index.html">RDF Browser</a>
</li>
<li>OpenLink <a href="http://demo.openlinksw.com/isparql">Interactive SPARQL Query by Example</a> (iSPARQL QBE)</li>
<li>Dynamic Data Web Pages from my prior posts [<a href="http://www.openlinksw.com/blog/~kidehen/?id=1144">1</a>][<a href="http://www.openlinksw.com/blog/~kidehen/index.vspx?page=&id=1145">2</a>][<a href="http://www.openlinksw.com/blog/~kidehen/index.vspx?page=&id=1146">3</a>]</li>
<li>
  <a href="http://dbpedia.org/docs/">dbpedia</a> (Wikipedia as a Data Web oriented Data Source)</li>
<li>And of course this blog post&#39;s permalink is a bona fide dereferencable URI.</li>
</ol>
<p>And of course there is more to come such as Grandma&#39;s Semantic Web Browser which is coming from <a href="http://fgiasson.com/blog/index.php/2007/02/18/zitgist_a_semantic_web_search_engine">Zitgist LLC</a> (pronounced: Zeitgeist) a joint venture of OpenLink Software and <a href="http://fgiasson.com/blog/">Frederick Giasson</a>.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-02-24#1143">
  <rss:title>Our Basic Human Instincts</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-02-24T01:03:38Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I just overheard the following dialog between my six year old son and his play date: Play Date: What is that thing on the Wall? My Son: Security Alarm Play Date: How does it work My Son: If you click on that top button and then open the door, I will have to enter a code when we come back in or the alarm will go off Play Date: What is the code? My Son: I can&#39;t tell you that! Play Date: Why not? My Son: You might come and steal something from our house! Play Date: No I won&#39;t! My Son: Well, you might tell someone that might come and steal something from our house! or that person could tell someone who could tell someone that would steal from our house LOL!! of course! At the same time wondering, how come a majority of adults don&#39;t quite see the need for granular access to Web Data in a manner that enables computers and humans to collectively arrive at similar decisions? Putting Data in context en route to producing actionable knowledge is a transient endeavor that engages a myriad of human senses. We demonstrate comprehension of this fact in our daily existence as social creatures (at a very early age as depicted above). That said, we seem to forget this fact when engaging the Web: If we can&#39;t see it then it can&#39;t be valuable. BTW - I just received a ping about the &quot;Sensory Web&quot; (which is just another way of describing a Data Driven Web experience from my vantage point.) In the popular M-V-C pattern you don&#39;t see the &quot;M&quot;, but the &quot;M&quot; will kill you if you get it wrong (it is the FORCE)! Coming to think about it, the pattern could have been coined: V-C-M or C-M-V, but isn&#39;t for obvious reasons :-) RDF is the vehicle that enables us to tap into the Data aspect of the Web. 
We started off with pages of blurb linked via hypertext (Web 1.0) and then looked to &quot;Keywords&quot; for some kind of data access; we then isolated some &quot;Verbs&quot; and discovered another dimension of Web Interaction (Web 2.0) but looked to these &quot;Verbs&quot; for data access which left us with Mashups; and now we are starting to extract &quot;Nouns&quot; and &quot;Adjectives&quot; from sentences (Subject, Predicate, Object - Triples) associated with resources on the Web (Data Web / Web 3.0 / Semantic Web Layer 1) which provides a natural data access substrate for Meshups (natural joining of disparate data from a plethora of data sources) while providing the foundation layer for the Semantic Web. For those who need use-cases that demonstrate tangible value re. the Semantic Web, here are some projects to note courtesy of the Semantic Web Education and Outreach (SWEO) interest group: FOAF based White-lists - Attacking SPAM Open Data Access and Linking for the Data Web - Data Integration and Generation effort that creates a cluster of RDF instance data from a myriad of data sources relating to every day things such as: People, Places, Events, Projects, Discussions, Music, Books, and other things Content Labeling - Protecting our kids on the Web amongst other matters relating to knowledge about data sources Others.. Related posts: Data Web and Global Data Integration &amp; Generation Effort Previous Data Web posts.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>I just overheard the following dialog between my six year old son and his play date:</p>

<blockquote> <pre>
Play Date: What is that thing on the Wall?
My Son: Security Alarm
Play Date: How does it work
My Son: If you click on that top button and then open the door, I will have to enter a code when we come back in or the alarm will go off
Play Date: What is the code?
My Son: I can&#39;t tell you that!
Play Date: Why not?
My Son: You might come and steal something from our house!
Play Date: No I won&#39;t!
My Son: Well, you might tell someone that might come and steal something from our house! or that person could tell someone who could tell someone that would steal from our house</pre></blockquote>

<p>LOL!! of course! At the same time wondering, how come a majority of adults don&#39;t quite see the need for granular access to Web Data in a manner that enables computers and humans to collectively arrive at similar decisions? </p>

<p>Putting Data in context en route to producing actionable knowledge is a transient endeavor that engages a myriad of human senses. We demonstrate comprehension of this fact in our daily existence as social creatures (at a very early age as depicted above). That said, we seem to forget this fact when engaging the Web: If we can&#39;t see it then it can&#39;t be valuable.</p>
<blockquote>
<p>BTW - I just received a ping about the &quot;<a href="http://www.flickr.com/photos/route79/399029535/">Sensory Web</a>&quot; (which is just another way of describing a Data Driven Web experience from my vantage point.)</p>
</blockquote>
<p>In the popular M-V-C pattern you don&#39;t see the &quot;M&quot;, but the &quot;M&quot; will kill you if you get it wrong (it is the FORCE)! Coming to think about it, the pattern could have been coined: V-C-M or C-M-V, but isn&#39;t for obvious reasons :-)</p>
<p>RDF is the vehicle that enables us to tap into the Data aspect of the Web. We started off with pages of blurb linked via hypertext (Web 1.0) and then looked to &quot;Keywords&quot; for some kind of data access; we then isolated some &quot;Verbs&quot; and discovered another dimension of Web Interaction (Web 2.0) but looked to these &quot;Verbs&quot; for data access which left us with Mashups;  and now we are starting to extract &quot;Nouns&quot; and &quot;Adjectives&quot; from sentences (Subject, Predicate, Object - Triples) associated with resources on the Web (Data Web / Web 3.0 / Semantic Web Layer 1) which provides a natural data access substrate for <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=meshups&type=text&output=html">Meshups</a> (natural joining of disparate data from a plethora of data sources) while providing the foundation layer for the <a href="http://en.wikipedia.org/wiki/Semantic_Web">Semantic Web</a>.</p> 
<p>For those who need use-cases that demonstrate tangible value re. the Semantic Web, here are some projects to note courtesy of the <a href="http://www.w3.org/2001/sw/sweo/">Semantic Web Education and Outreach</a> (SWEO) interest group: </p>
<ol>
<li>
<a href="http://esw.w3.org/topic/SweoIG/TaskForces/CommunityProjects/FOAFWhitelisting">FOAF based White-lists</a> - Attacking SPAM 
</li>
<li>
<a href="http://esw.w3.org/topic/SweoIG/TaskForces/CommunityProjects/LinkingOpenData">Open Data Access and Linking for the Data Web</a> - Data Integration and Generation effort that creates a cluster of RDF instance data from a myriad of data sources relating to every day things such as: People, Places, Events, Projects, Discussions, Music, Books, and other things 
</li>
<li>
<a href="http://esw.w3.org/topic/SweoIG/TaskForces/CommunityProjects/PowderExtension">Content Labeling</a> - Protecting our kids on the Web amongst other matters relating to knowledge about data sources
</li>
<li>
<a href="http://esw.w3.org/topic/SweoIG/TaskForces/CommunityProjects">Others..</a>
</li>
</ol>

Related posts:
<ol>
<li>
  <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=rdf%20data%20integration&type=text&output=html">Data Web and Global Data Integration &amp; Generation Effort</a>
</li>
<li>
  <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q='data%20web'&type=text&output=html">Previous Data Web posts</a>.</li>
</ol>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2007-01-22#1123">
  <rss:title>Semantic Web Data Generation Activity: FOAF Crawling</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2007-01-22T15:57:31Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Frederick Giasson provides compelling data that supports the view that the Semantic Web bootstrap is a global Data Integration &amp; Data Generation effort that inevitably involves a variety of Data Sources such as: social networks, blogs, wikis etc. The Data in Fred&#39;s post is based on FOAF Ontology instance data generated from a myriad of Data Sources.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://fgiasson.com">Frederick Giasson</a> provides compelling data that supports the view that the <a href="http://www.openlinksw.com/blog/~kidehen/?id=1122">Semantic Web bootstrap is a global Data Integration &amp; Data Generation effort</a> that inevitably involves a variety of Data Sources such as: social networks, blogs, wikis etc.</p>
<p> The Data in Fred&#39;s post is based on <a href="http://fgiasson.com/blog/index.php/2007/01/21/reaching_at_least_600_000_people_with_19">FOAF Ontology instance data generated from a myriad of Data Sources</a>.</p>

]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-10-18#1064">
  <rss:title>Virtuoso&#39;s SQL Schema to RDF Ontology Mapping Language (1.0)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-10-18T22:18:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">A new technical white paper about our declarative language for SQL Schema to RDF Ontology Mapping has just been published. What is this? A declarative language adapted from SPARQL&#39;s graph pattern language (N3/Turtle) for mapping SQL Data to RDF Ontologies. We currently refer to this as a Graph Pattern based RDF VIEW Definition Language. Why is it important? It provides an effective mechanism for exposing existing SQL Data as virtual RDF Data Sets (Graphs) negating the data duplication associated with generating physical RDF Graphs from SQL Data en route to persistence in a dedicated Triple Store. Enterprise applications (traditional and web based) and most Web Applications (Web 1.0 and Web 2.0) sit atop relational databases, implying that SQL/RDF model and data integration is an essential element of the burgeoning &quot;Data Web&quot; (Semantic Web - Layer 1) comprehension and adoption process. In a nutshell, this is a quick route for non disruptive exposure of existing SQL Data to SPARQL supporting RDF Tools and Development Environments. How does it work? RDF Side locate one or more Ontologies (e.g FOAF, SIOC, AtomOWL, SKOS etc.) that effectively defines the Concepts (Classes) and Terms (Predicates) to be exposed via your RDF Graph Using the Virtuoso&#39;s RDF View Definition Language declare an Internationalized Resource Identifier (or URI) for your Graph. Example:CREATE GRAPH IRI(&quot;http://myopenlink.net/dataspace&quot;) Then create Classes (Concepts), Class Properties/Predicates (Memb), and Class Instances (Inst) for the new Graph. 
Example: CREATE IRI CLASS odsWeblog:feed_iri &quot;http://myopenlink.net/dataspace/kidehen/weblog/MyFeeds&quot; ( in memb varchar not null, in inst varchar not null) SQL Side If Virtuoso isn&#39;t your SQL Data Store, Identify the ODBC or JDBC SQL data source(s) containing the SQL data to be mapped to RDF and then link the relevant tables into Virtuoso&#39;s Virtual DBMS Layer Then use the RDF View Definition Language&#39;s graph pattern feature to generate SQL to RDF Mapping Template for your Graph. As shown in this ODS Weblog -&gt; AtomOWL Mapping example.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>A new technical white paper about our declarative language for SQL Schema to RDF Ontology Mapping has just been published.</p>

<h2>What is this?</h2>
<p>A declarative language adapted from SPARQL&#39;s graph pattern language (N3/Turtle) for mapping SQL Data to RDF Ontologies. We currently refer to this as a Graph Pattern based RDF VIEW Definition Language.</p>

<h2>Why is it important?</h2>
<p>It provides an effective mechanism for exposing existing SQL Data as virtual RDF Data Sets (Graphs) negating the data duplication associated with generating physical RDF Graphs from SQL Data en route to persistence in a dedicated Triple Store. </p>

<p>Enterprise applications (traditional and web based) and most Web Applications (Web 1.0 and Web 2.0) sit atop relational databases, implying that SQL/RDF model and data integration is an essential element of the burgeoning &quot;Data Web&quot; (Semantic Web - Layer 1) comprehension and adoption process.</p>

<p>In a nutshell, this is a quick route for non disruptive exposure of existing SQL Data to SPARQL supporting RDF Tools and Development Environments.</p>

<h2>How does it work?</h2>
<h3>RDF Side</h3>
<ol>
<li>locate one or more Ontologies (e.g FOAF, SIOC, AtomOWL, SKOS etc.) that effectively defines the Concepts (Classes) and Terms (Predicates) to be exposed via your RDF Graph</li>

<li>Using the Virtuoso&#39;s RDF View Definition Language declare an Internationalized Resource Identifier (or URI) for your Graph. Example:<pre>CREATE GRAPH IRI(&quot;http://myopenlink.net/dataspace&quot;)</pre>  </li>
<li>Then create Classes (Concepts), Class Properties/Predicates (Memb), and Class Instances (Inst) for the new Graph. Example: <pre>CREATE IRI CLASS odsWeblog:feed_iri  &quot;http://myopenlink.net/dataspace/kidehen/weblog/MyFeeds&quot; (
  in memb varchar not null, in inst varchar not null)</pre>
</li>
</ol>
<h3>SQL Side</h3>
<ol>
<li>If Virtuoso isn&#39;t your SQL Data Store, Identify the ODBC or JDBC SQL data source(s) containing the SQL data to be mapped to RDF and then link the relevant tables into Virtuoso&#39;s Virtual DBMS Layer</li>

<li>Then use the RDF View Definition Language&#39;s graph pattern feature to generate SQL to RDF Mapping Template for your Graph. As shown in this <a href="http://www.usnet.private:8889/wiki/main/Main/VOSSQLRDF#MappingOdsBlogToAtomOwlExample">ODS Weblog -&gt; AtomOWL Mapping example</a>.</li>

</ol>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-09-07#1036">
  <rss:title>Creating connections between discussion clouds with SIOC</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-09-08T00:56:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Another example of Data Spaces in action by John Breslin.. In this case John visualizes the connections that are exploitable by creating SIOC (Semantically-Interlinked Online Communities) instance data from existing Distributed Collaborative Application profiles (Web 2.0 in current parlance). Of course, SIOC is an Ontology for RDF data since it describes the Concepts and Terms for a network mesh of online communities. Which by implication provides another insight into the realization that the Web we know has always been a &quot;Web of Databases&quot; (federation of Graph Model Databases encapsulated in Data Spaces). The emergence of SPARQL as the standard Query Language for querying RDF Data Sets, alongside the SPARQL Protocol for transmitting SPARQL Queries over HTTP, and the SPARQL Query Results Serialization formats (XML or JSON), basically set the stage for truly open and flexible data access across Web Data Space clusters such as: the Blogosphere, Wikisphere, Usenetverse, Linkspaces, Boardscapes, and others. For additional clarity re. my comments above, you can also look at the SPARQL &amp; SIOC Usecase samples document for our OpenLink Data Spaces platform. Bottom line, the Semantic Web and SPARQL aren&#39;t BORING. In fact, quite the contrary, since they are essential ingredients of a more powerful Web than the one we work with today! Enjoy the rest of John&#39;s post: Creating connections between discussion clouds with SIOC: (Extract from our forthcoming BlogTalk paper about browsers for SIOC.) SIOC provides a unified vocabulary for content and interaction description: a semantic layer that can co-exist with existing discussion platforms. Using SIOC, various linkages are created between the aforementioned concepts, which allow new methods of accessing this linked data, including: Virtual Forums. 
These may be a gathering of posts or threads which are distributed across discussion platforms, for example, where a user has found posts from a number of blogs that can be associated with a particular category of interest, or an agent identifies relevant posts across a certain timeframe. Distributed Conversations. Trackbacks are commonly used to link blog posts to previous posts on a related topic. By creating links in both directions, not only across blogs but across all types of internet discussions, conversations can be followed regardless of what point or URI fragment a browser enters at. Unified Communities. Apart from creating a web page with a number of relevant links to the blogs or forums or people involved in a particular community, there is no standard way to define what makes up an online community (apart from grouping the people who are members of that community using FOAF or OPML). SIOC allows one to simply define what objects are constituent parts of a community, or to say to what community an object belongs (using sioc:has_part / part_of): users, groups, forums, blogs, etc. Shared Topics. Technorati (a search engine for blogs) and BoardTracker (for bulletin boards) have been leveraging the free-text tags that people associate with their posts for some time now. SIOC allows the definition of such tags (using the subject property), but also enables hierarchial or non-hierarchial topic definition of posts using sioc:topic when a topic is ambiguous or more information on a topic is required. Combining with other Semantic Web vocabularies, tags and topics can be further described using the SKOS organisation system. One Person, Many User Accounts. SIOC also aims to help the issue of multiple identities by allowing users to define that they hold other accounts or that their accounts belong to a particular personal identity (via foaf:holdsOnlineAccount or sioc:account_of). 
Therefore, all the posts or comments made by a particular person using their various associated user accounts across platforms could be identified.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
 <p>Another example of <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=%27data%20spaces%27&type=text&output=html">Data Spaces</a> in action by <a href="http://www.johnbreslin.com/blog">John Breslin</a>.. In this case John visualizes the connections that are exploitable by creating SIOC (<a href="http://rdfs.org/sioc/">Semantically-Interlinked Online Communities</a>) instance data from existing Distributed Collaborative Application profiles (<a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=web%202.0&type=text&output=html">Web 2.0</a> in current parlance). Of course, SIOC is an Ontology for RDF data since it describes the Concepts and Terms for a network mesh of online communities. Which by implication provides another insight into the realization that the Web we know has always been a &quot;Web of Databases&quot; (federation of Graph Model Databases encapsulated in Data Spaces). The emergence of <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&q=sparql%0D%0A&type=text&output=html">SPARQL</a> as the standard <a href="http://www.w3.org/TR/rdf-sparql-query/">Query Language for querying RDF Data Sets</a>, alongside the SPARQL Protocol for transmitting SPARQL Queries over HTTP, and the SPARQL Query Results Serialization formats (XML or JSON), basically set the stage for truly open and flexible data access across Web Data Space clusters such as: the Blogosphere, Wikisphere, Usenetverse, Linkspaces, Boardscapes, and others.</p> <p> For additional clarity re. my comments above, you can also look at the <a href="http://virtuoso.openlinksw.com/wiki/main/Main/ODSSIOCRef">SPARQL &amp; SIOC Usecase samples document</a> for our <a href="http://virtuoso.openlinksw.com/wiki/main/Main/OdsIndex">OpenLink Data Spaces platform</a>. 
Bottom line, the Semantic Web and SPARQL aren&#39;t <a href="http://morenews.blogspot.com/2006/09/myth-of-web-20.html"> BORING.</a> In fact, quite the contrary, since they are essential ingredients of a more powerful Web than the one we work with today!</p> <p>Enjoy the rest of John&#39;s post:</p> <blockquote> <p>   <a href="http://www.johnbreslin.com/blog/2006/09/07/creating-connections-between-discussion-clouds-with-sioc/#comments">Creating connections between discussion clouds with SIOC</a>: </p> <p>(Extract from our forthcoming <a href="http://blogtalk.net/Main/Program"> BlogTalk</a> paper about browsers for SIOC.)</p> <p>   <a class="imagelink" title="20060907b.png" href="http://www.johnbreslin.com/blog/wp-content/uploads/2006/09/20060907a.png"><img id="image515" alt="20060907b.png" src="http://www.johnbreslin.com/blog/wp-content/uploads/2006/09/20060907b.png" />   </a> </p> <p>SIOC provides a unified vocabulary for content and interaction description: a semantic layer that can co-exist with existing discussion platforms. Using SIOC, various linkages are created between the aforementioned concepts, which allow new methods of accessing this linked data, including:</p> <ul> <li>     <strong>Virtual Forums</strong>. These may be a gathering of posts or threads which are distributed across discussion platforms, for example, where a user has found posts from a number of blogs that can be associated with a particular category of interest, or an agent identifies relevant posts across a certain timeframe.</li> <li>     <strong>Distributed Conversations</strong>. Trackbacks are commonly used to link blog posts to previous posts on a related topic. By creating links in both directions, not only across blogs but across all types of internet discussions, conversations can be followed regardless of what point or URI fragment a browser enters at.</li> <li>     <strong>Unified Communities</strong>. 
Apart from creating a web page with a number of relevant links to the blogs or forums or people involved in a particular community, there is no standard way to define what makes up an online community (apart from grouping the people who are members of that community using FOAF or OPML). SIOC allows one to simply define what objects are constituent parts of a community, or to say to what community an object belongs (using sioc:has_part / part_of): users, groups, forums, blogs, etc.</li> <li>     <strong>Shared Topics</strong>. Technorati (a search engine for blogs) and BoardTracker (for bulletin boards) have been leveraging the free-text tags that people associate with their posts for some time now. SIOC allows the definition of such tags (using the subject property), but also enables hierarchial or non-hierarchial topic definition of posts using sioc:topic when a topic is ambiguous or more information on a topic is required. Combining with other Semantic Web vocabularies, tags and topics can be further described using the SKOS organisation system.</li> <li>     <strong>One Person, Many User Accounts</strong>. SIOC also aims to help the issue of multiple identities by allowing users to define that they hold other accounts or that their accounts belong to a particular personal identity (via foaf:holdsOnlineAccount or sioc:account_of). Therefore, all the posts or comments made by a particular person using their various associated user accounts across platforms could be identified.</li> </ul>  </blockquote>  
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-08-28#1030">
  <rss:title>Data Spaces and Web of Databases</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-08-28T19:38:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Note: An updated version of a previously unpublished blog post: Continuing from our recent Podcast conversation, Jon Udell sheds further insight into the essence of our conversation via a “Strategic Developer” column article titled: Accessing the web of databases. Below, I present an initial dump of a DataSpace FAQ that hopefully sheds light on the DataSpace vision espoused during my podcast conversation with Jon. What is a DataSpace? A moniker for Web-accessible atomic containers that manage and expose Data, Information, Services, Processes, and Knowledge. What would you typically find in a Data Space? Examples include: Raw Data - SQL, HTML, XML (raw), XHTML, RDF etc. Information (Data In Context) - XHTML (various microformats), Blog Posts (in RSS, Atom, RSS-RDF formats), Subscription Lists (OPML, OCS, etc), Social Networks (FOAF, XFN etc.), and many other forms of applied XML. Web Services (Application/Service Logic) - REST or SOAP based invocation of application logic for context sensitive and controlled data access and manipulation. Persisted Knowledge - Information in actionable context that is also available in transient or persistent forms expressed using a Graph Data Model. A modern knowledgebase would more than likely have RDF as its Data Language, RDFS as its Schema Language, and OWL as its Domain&#160; Definition (Ontology) Language. Actual Domain, Schema, and Instance Data would be serialized using formats such as RDF-XML, N3, Turtle, etc. How do Data Spaces and Databases differ? Data Spaces are fundamentally problem-domain-specific database applications. They offer functionality that you would instinctively expect of a database (e.g. ACID data management) with the additional benefit of being data model and query language agnostic. 
Data Spaces are for the most part DBMS Engine and Data Access Middleware hybrids in the sense that ownership and control of data is inherently loosely-coupled. How do Data Spaces and Content Management Systems differ? Data Spaces are inherently more flexible, they support multiple data models and data representation formats. Content management systems do not possess the same degree of data model and data representation dexterity. How do Data Spaces and Knowledgebases differ? A Data Space cannot dictate the perception of its content. For instance, what I may consider as knowledge relative to my Data Space may not be the case to a remote client that interacts with it from a distance. Thus, defining my Data Space as Knowledgebase, purely, introduces constraints that reduce its broader effectiveness to third party clients (applications, services, users, etc.). A Knowledgebase is based on a Graph Data Model resulting in significant impedance for clients that are built around alternative models. To reiterate, Data Spaces support multiple data models. What Architectural Components make up a Data Space? ORDBMS Engine - for Data Modeling agility (via complex purpose specific data types and data access methods), Data Atomicity, Data Concurrency, Transaction Isolation, and Durability (aka ACID). Virtual Database Engine - for creating a single view of, and access point to, heterogeneous SQL, XML, Free Text, and other data. This is all about Virtualization at the Data Access Level. Web Services Platform - enabling controlled access and manipulation (via application, service, or protocol logic) of Virtualized or Disparate Data. This layer handles the decoupling of functionality from monolithic wholes for function specific invocation via Web Services using either the SOAP or REST approach. Where do Data Spaces fit into the Web&#39;s rapid evolution? They are an essential part of the burgeoning Data Web / Semantic Web. 
In short, they will take us from data “Mash-ups” (combining web accessible data that exists without integration and repurposing in mind) to “Mesh-ups” (combining web accessible data that exists with integration and repurposing in mind). Where can I see a DataSpace along the lines described, in action? Just look at my blog, and take the journey as follows: Front Door (Web 1.0) Lounge (Web 2.0) via GData or OpenSearch Floor Plan via FOAF or SIOC RDF Data Sets (Graphs) Rest of the house (beyond Web 2.0) sending&#160; SPARQL Queries to a SPARQL Endpoint. What about other Data Spaces? There are several and I will attempt to categorize along the lines of query method available: Type 1 (Free Text Search over HTTP): Google, MSN, Yahoo!, Amazon, eBay, and most Web 2.0 plays. Type 2 (Free Text Search and XQuery/XPath over HTTP) A few blogs and Wikis (Jon Udell&#39;s and a few others) Type 3 (RDF Data Sets and SPARQL Queryable): &#160;&#160; SIOC enabled sites (aka points of semantic web presence) &#160;&#160; PingTheSemantic Type 4 (Generic Free Text Search, OpenSearch, GData, XQuery/XPath, and SPARQL): Points of Semantic Web presence such as the Data Spaces at: My Blog Data Space (as stated earlier in this post) My General Data Space - (ditto; note that this is currently experimental) What About Data Space aware tools? &#160;&#160; OpenLink Ajax Toolkit - provides Javascript Control level binding to Query Services such as XMLA for SQL, GData for Free Text, OpenSearch for Free Text, SPARQL for RDF, in addition to service specific Web Services (Web 2.0 hosted solutions that expose service specific APIs) &#160;&#160; Semantic Radar - a Firefox Extension &#160;&#160; PingTheSemantic - the Semantic Web&#39;s equivalent of Web 2.0&#39;s weblogs.com &#160;&#160; PiggyBank - a Firefox Extension</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Note: An updated version of a previously unpublished blog post:</p>
    <p> Continuing from <a href="http://weblog.infoworld.com/udell/2006/04/28.html">our recent Podcast conversation</a>, Jon Udell sheds further insight into the essence of our conversation via a “Strategic Developer” column article titled: <a href="http://www.infoworld.com/cgi-bin/redirect?source=rss&url=http://www.infoworld.com/article/06/05/03/77873_19OPstrategic_1.html">Accessing the web of databases</a>. </p> <p> Below, I present an initial dump of a DataSpace FAQ that hopefully sheds light on the DataSpace vision espoused during my podcast conversation with Jon. </p> <p> What is a DataSpace? <br /> </p> <p>A moniker for Web-accessible atomic containers that manage and expose Data, Information, Services, Processes, and Knowledge.  </p> <p> What would you typically find in a Data Space? Examples include: </p> <ul> <li>Raw Data - SQL, HTML, XML (raw), XHTML, RDF etc.<br />   <br /> </li> <li>Information (Data In Context) - XHTML (various microformats), Blog Posts (in RSS, Atom, RSS-RDF formats), Subscription Lists (OPML, OCS, etc), Social Networks (FOAF, XFN etc.), and many other forms of applied XML.</li>  </ul> <ul> <li>Web Services (Application/Service Logic) - REST or SOAP based invocation of application logic for context sensitive and controlled data access and manipulation.</li> </ul> <ul> <li>Persisted Knowledge - Information in actionable context that is also available in transient or persistent forms expressed using a Graph Data Model. A modern knowledgebase would more than likely have RDF as its Data Language, RDFS as its Schema Language, and OWL as its Domain&#160; Definition  (Ontology) Language. Actual Domain, Schema, and  Instance Data would be serialized using formats such as RDF-XML, N3, Turtle, etc.</li> </ul> <p> How do Data Spaces and Databases differ? <br />Data Spaces are fundamentally problem-domain-specific database applications. They offer functionality that you would instinctively expect of a database (e.g. 
ACID data management) with the additional benefit of being data model and query language agnostic. Data Spaces are for the most part DBMS Engine and Data Access Middleware hybrids in the sense that ownership and control of data is inherently loosely-coupled. </p> <p>How do Data Spaces and Content Management Systems differ?<br />Data Spaces are inherently more flexible, they support multiple data models and data representation formats. Content management systems do not possess the same degree of data model and data representation dexterity. </p>  <p>How do Data Spaces and Knowledgebases differ?<br />A Data Space cannot dictate the perception of its content. For instance, what I may consider as knowledge relative to my Data Space may not be the case to a remote client that interacts with it from a distance. Thus, defining my Data Space as Knowledgebase, purely, introduces constraints that reduce its broader effectiveness to third party clients (applications, services, users, etc.). A Knowledgebase is based on a Graph Data Model resulting in significant impedance for clients that are built around alternative models. To reiterate, Data Spaces support multiple data models.  </p> <p> What Architectural Components make up a Data Space? </p>  <ul> <li>ORDBMS Engine - for Data Modeling agility (via complex purpose specific data types and data access methods), Data Atomicity, Data Concurrency, Transaction Isolation, and Durability (aka ACID).<br />   <br /> </li> <li>Virtual Database Engine - for creating a single view of, and access point to,  heterogeneous SQL, XML, Free Text, and other data. This is all about Virtualization at the Data Access Level.</li> </ul> <ul> <li>Web Services Platform - enabling controlled access and manipulation (via application, service, or protocol logic) of Virtualized or Disparate Data. 
This layer handles the decoupling of functionality from monolithic wholes for function specific invocation via Web Services using either the SOAP or REST approach.</li> </ul> <br />Where do Data Spaces fit into the Web&#39;s rapid evolution?<br />They are an essential part of the burgeoning Data Web / Semantic Web. In short, they will take us from data “Mash-ups” (combining web accessible data that exists without integration and repurposing in mind) to “Mesh-ups” (combining web accessible data that exists with integration and repurposing in mind).<p> Where can I see a DataSpace along the lines described, in action? </p> <p> Just look at my blog, and take the journey as follows: </p>  <ul> <li>   <a href="http://www.openlinksw.com/blog/%7Ekidehen/">Front Door</a> (Web 1.0)</li> <li>Lounge (Web 2.0) via <a href="http://www.openlinksw.com/GData/127">GData</a> or <a href="http://www.openlinksw.com/weblog/public/search.vspx?blogid=127&type=text&kwds=%27semantic+web%27&amp;OpenSearch">OpenSearch</a> </li> <li>Floor Plan via <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/about.rdf">FOAF</a> or <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com/sioc.rdf">SIOC</a> RDF Data Sets (Graphs)</li>  <li>Rest of the house (beyond Web 2.0) sending&#160; <a href="http://virtuoso.openlinksw.com/wiki/main/Main/VOSODSSparqlSamples">SPARQL Queries</a> to a <a href="http://myopenlink.net:8890/sparql/">SPARQL Endpoint</a>.<br />  </li> </ul>  <p> What about other Data Spaces? </p> <p> There are several and I will attempt to categorize along the lines of query method available: <br />Type 1 (Free Text Search over HTTP): <br />Google, MSN, Yahoo!, Amazon, eBay, and most Web 2.0 plays. 
</p> <p> Type 2 (Free Text Search and XQuery/XPath over HTTP) <br />A few blogs and Wikis (Jon Udell&#39;s and a few others)</p>Type 3 (RDF Data Sets and SPARQL Queryable):<br /> <ul> <li>&#160;&#160; <a href="http://esw.w3.org/topic/SIOC/EnabledSites">SIOC enabled sites</a> (aka points of semantic web presence)<br />
</li>  <li>&#160;&#160; <a href="http://pingthesemanticweb.com/">PingTheSemantic</a> <br />  </li> </ul>Type 4 (Generic Free Text Search, OpenSearch, GData, XQuery/XPath, and SPARQL):<br />Points of Semantic Web presence such as the Data Spaces at: <br /> <ul>  <li>
  <a href="http://www.openlinksw.com/dataspace/kidehen@openlinksw.com">My Blog Data Space</a> (as stated earlier in this post)<br />  </li>  <li>
  <a href="http://myopenlink.net:8890/dataspace/kidehen@openlinksw.com">My General Data Space</a> - (ditto; note that this is currently experimental)<br />  </li> </ul> <p>What About Data Space aware tools?<br /> <br /> </p> <ul> <li>&#160;&#160; <a href="http://demo.openlinksw.com/DAV/JS/oat/index.html/">OpenLink Ajax Toolkit </a>- provides Javascript Control level binding to Query Services such as XMLA for SQL, GData for Free Text, OpenSearch for Free Text, SPARQL for RDF, in addition to service specific Web Services (Web 2.0 hosted solutions that expose service specific APIs)</li> <li>&#160;&#160; <a href="http://rdfs.org/sioc/firefox">Semantic Radar </a>- a Firefox Extension</li> <li>&#160;&#160; <a href="http://pingthesemanticweb.com/">PingTheSemantic</a> - the Semantic Web&#39;s equivalent of Web 2.0&#39;s weblogs.com</li> <li>&#160;&#160; <a href="http://simile.mit.edu/piggy-bank/">PiggyBank</a> - a Firefox Extension</li> </ul> <p> </p>    
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/vdb/blog/?date=2006-08-10#1025">
  <rss:title>Virtuoso and ODS Update</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-08-10T11:55:26Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Virtuoso and ODS Update We have released an update of Virtuoso Open Source Edition and the OpenLink Data Spaces suite. This marks the coming of age of our RDF and SPARQL efforts. We have the new SQL cost model with SPARQL awareness, we have applications which present much of their data as SIOC, FOAF, ATOM OWL and other formats. We continue refining these technologies. Our next roadmap item is mapping relational data into RDF and offering SPARQL access to relational data without data duplication. Expect a white paper about this soon.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<div>
<div style="display:none;">Virtuoso and ODS Update</div>
<p>We have released an update of <a href="http://virtuoso.openlinksw.com" id="link-id0x1b0d5100">Virtuoso</a> Open Source Edition and the <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id0x1770ad30">OpenLink Data Spaces</a> suite.</p>
<p>This marks the coming of age of our <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x1a1c6800">RDF</a> and <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x1779b790">SPARQL</a> efforts. We have the new <a href="http://dbpedia.org/resource/SQL" id="link-id0x170db778">SQL</a> cost model with SPARQL awareness, we have applications which present much of their <a href="http://dbpedia.org/resource/Data" id="link-id0x18ab4600">data</a> as SIOC, FOAF, ATOM OWL and other formats.</p>
<p>We continue refining these technologies. Our next roadmap item is mapping relational data into RDF and offering SPARQL access to relational data without data duplication. Expect a white paper about this soon.</p>
</div>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/weblog/oerling/?date=2006-08-10#1024">
  <rss:title>Virtuoso and ODS Update</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-08-10T11:06:01Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">We have released an update of Virtuoso Open Source Edition and the OpenLink Data Spaces suite. This marks the coming of age of our RDF and SPARQL efforts. We have the new SQL cost model with SPARQL awareness, we have applications which present much of their data as SIOC, FOAF, ATOM OWL and other formats. We continue refining these technologies. Our next roadmap item is mapping relational data into RDF and offering SPARQL access to relational data without data duplication. Expect a white paper about this soon.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>We have released an update of <a href="http://virtuoso.openlinksw.com" id="link-id0xddc9c48">Virtuoso</a> Open Source Edition and the <a href="http://dbpedia.org/resource/OpenLink_Data_Spaces" id="link-id0x199d1fc0">OpenLink Data Spaces</a> suite.</p>
<p>This marks the coming of age of our <a href="http://dbpedia.org/resource/Resource_Description_Framework" id="link-id0x19347570">RDF</a> and <a href="http://dbpedia.org/resource/SPARQL" id="link-id0x1b202218">SPARQL</a> efforts. We have the new <a href="http://dbpedia.org/resource/SQL" id="link-id0x18bf3c08">SQL</a> cost model with SPARQL awareness, we have applications which present much of their <a href="http://dbpedia.org/resource/Data" id="link-id0x1a161428">data</a> as SIOC, FOAF, ATOM OWL and other formats.</p>
<p>We continue refining these technologies. Our next roadmap item is mapping relational data into RDF and offering SPARQL access to relational data without data duplication. Expect a white paper about this soon.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-07-04#995">
  <rss:title>Standards as social contracts</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-07-04T17:25:51Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Standards as social contracts: &quot;Looking at Dave Winer&#39;s efforts in evangelizing OPML, I try to draw some rough lines into what makes a de-facto standard. De Facto standards are made and seldom happen on their own. In this entry, I look back at the history of HTML, RSS, the open source movement and try to draw some lines as to what makes a standard. &quot; (Via Tristan Louis.) I posted a comment to Tristan Louis&#39; post along the following lines: Analysis is spot on re. the link between de facto standardization and bootstrapping. Likewise, the clear linkage between bootstrapping and connected communities (a variation of the social networking paradigm). Dave built a community around an XML content syndication and subscription usecase demo that we know today as the blogosphere. Superficially, one may conclude that the Semantic Web vision has suffered to date from a lack of a similar bootstrap effort. Whereas in reality, we are dealing with &quot;time and context&quot; issues that are critical to the base understanding upon which a &quot;Dave Winer&quot; style bootstrap for the Semantic Web would occur. Personally, I see the emergence of Web 2.0 (esp. the mashups phenomenon) as the &quot;time and context&quot; seeds from which the Semantic Web bootstrap will sprout. I see shared ontologies such as FOAF and SIOC leading the way (they are the RSS 2.0&#39;s of the Semantic Web IMHO).</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<blockquote>
<p>
<a href="http://www.tnl.net/blog/2006/06/07/standards-as-social-contracts/#comments">Standards as social contracts</a>: &quot;Looking at Dave Winer&#39;s efforts in evangelizing OPML, I try to draw some rough lines into what makes a de-facto standard. De Facto standards are made and seldom happen on their own. In this entry, I look back at the history of HTML, RSS, the open source movement and try to draw some lines as to what makes a standard.
</p>
<p>
 <a href="http://feeds.tristanlouis.com/~a/TNLnet?a=nXIQUu"><img src="http://feeds.tristanlouis.com/~a/TNLnet?i=nXIQUu" border="0" />
 </a>
</p>
<div class="feedflare">
 <a href="http://feeds.tristanlouis.com/~f/TNLnet?a=dklI2jYY"><img src="http://feeds.tristanlouis.com/~f/TNLnet?i=dklI2jYY" border="0" />
 </a> <a href="http://feeds.tristanlouis.com/~f/TNLnet?a=HoauA2Ma"><img src="http://feeds.tristanlouis.com/~f/TNLnet?i=HoauA2Ma" border="0" /></a> <a href="http://feeds.tristanlouis.com/~f/TNLnet?a=DxOLN3Br"><img src="http://feeds.tristanlouis.com/~f/TNLnet?i=DxOLN3Br" border="0" /></a> <a href="http://feeds.tristanlouis.com/~f/TNLnet?a=zU2uLdOm"><img src="http://feeds.tristanlouis.com/~f/TNLnet?i=zU2uLdOm" border="0" /></a>
</div>&quot;

<p>(Via <a href="http://www.tnl.net/blog">Tristan Louis</a>.)</p>
</blockquote>
<p>I posted a comment to Tristan Louis&#39; post along the following lines:</p>
<p>Analysis is spot on re. the link between de facto standardization and bootstrapping. Likewise, the clear linkage between bootstrapping and connected communities (a variation of the social networking paradigm). </p>

<p>Dave built a community around an XML content syndication and subscription usecase demo that we know today as the blogosphere. Superficially, one may conclude that the Semantic Web vision has suffered to date from a lack of a similar bootstrap effort. Whereas in reality, we are dealing with &quot;time and context&quot; issues that are critical to the base understanding upon which a &quot;Dave Winer&quot; style bootstrap for the Semantic Web would occur.</p>

<p>Personally, I see the emergence of Web 2.0 (esp. the mashups phenomenon) as the &quot;time and context&quot; seeds from which the Semantic Web bootstrap will sprout. I see shared ontologies such as <a href="http://oplussol5.usnet.private:8893/foaf">FOAF</a> and <a href="http://rdfs.org/sioc/">SIOC</a> leading the way (they are the RSS 2.0&#39;s of the Semantic Web IMHO).</p>

]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-05-11#973">
  <rss:title>SPARQL Parameterized Queries (Virtuoso using SPARQL in SQL)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-05-11T18:54:47Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">SPARQL with SQL (Inline) Virtuoso extends its SQL3 implementation with syntax for integrating SPARQL into queries and subqueries.Thus, as part of a SQL SELECT query or subquery, one can write the SPARQL keyword and a SPARQL query as part of query text processed by Virtuoso&#39;s SQL Query Processor. Example 1 (basic) : Using Virtuoso&#39;s Command line or the Web Based ISQL utility type in the following (note: &quot;SQL&gt;&quot; is the command line prompt for the native ISQL utility): SQL&gt; sparql select distinct ?p where { graph ?g { ?s ?p ?o } }; Which will return the following: p varchar ---------- http://example.org/ns#b http://example.org/ns#d http://xmlns.com/foaf/0.1/name http://xmlns.com/foaf/0.1/mbox ... Example 2 (a subquery variation): SQL&gt; select distinct subseq (p, strchr (p, &#39;#&#39;)) as fragment from (sparql select distinct ?p where { graph ?g { ?s ?p ?o } } ) as all_predicates where p like &#39;%#%&#39; ; fragment varchar ---------- #query #data #name #comment ... Parameterized Queries: You can pass parameters to a SPARQL query using a Virtuoso-specific syntax extension. &#39;??&#39; or &#39;$?&#39; indicates a positional parameter similar to &#39;?&#39; in standard SQL. &#39;??&#39; can be used in graph patterns or anywhere else where a SPARQL variable is accepted. The value of a parameter should be passed in SQL form, i.e. this should be a number or an untyped string. An IRI ID can not be passed, but an absolute IRI can. Using this notation, a dynamic SQL capable client (ODBC, JDBC, ADO.NET, OLEDB, XMLA, or others) can execute parametrized SPARQL queries using parameter binding concepts that are common place in dynamic SQL. Which implies that existing SQL applications and development environments (PHP, Ruby, Python, Perl, VB, C#, Java, etc.) are capable of issuing SPARQL queries via their existing SQL bound data access channels against RDF Data stored in Virtuoso. 
Note: This is the Virtuoso equivalent of a recently published example using Jena (a Java based RDF Triple Store). Example: Create a Virtuoso Function by execting the following: SQL&gt; create function param_passing_demo (); { declare stat, msg varchar; declare mdata, rset any; exec (&#39;sparql select ?s where { graph ?g { ?s ?? ?? }}&#39;, stat, msg, vector (&#39;http://www.w3.org/2001/sw/DataAccess/tests/data/Sorting/sort-0#int1&#39;, 4 ), -- Vector of two parameters 10, -- Max. result-set rows mdata, -- Variable for handling result-set metadata rset -- Variable for handling query result-set ); return rset[0][0]; } Test new &quot;param_passing_demo&quot; function by executing the following: SQL&gt; select param_passing_demo (); Which returns: callret VARCHAR _______________________________________________________________________________ http://www.w3.org/2001/sw/DataAccess/tests/data/Sorting/sort-0#four 1 Rows. -- 00000 msec. Â  Using SPARQL in SQL Predicates: A SPARQL ASK query can be used as an argument of the SQL EXISTS predicate. create function sparql_ask_demo () returns varchar { if (exists (sparql ask where { graph ?g { ?s ?p 4}})) return &#39;YES&#39;; else return &#39;NO&#39;; }; Test by executing: SQL&gt; select sparql_ask_demo (); Which returns: _________________________ YES</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<h2>SPARQL with SQL (Inline) </h2>

<p>Virtuoso extends its SQL3 implementation with syntax for integrating SPARQL into queries and subqueries. Thus, as part of a SQL SELECT query or subquery, one can write the SPARQL keyword and a SPARQL query as part of query text processed by Virtuoso&#39;s SQL Query Processor.</p>

<h4>Example 1 (basic) : </h4>
<p>Using Virtuoso&#39;s  Command line or the Web Based ISQL utility type in the following (note: &quot;SQL&gt;&quot; is the command line prompt for the native ISQL utility): </p>
<pre>SQL&gt; sparql select distinct ?p where { graph ?g { ?s ?p ?o } };</pre>
<p>Which will return the following: </p>
<blockquote>
  <pre>	  p varchar
     ----------
     http://example.org/ns#b
     http://example.org/ns#d
     http://xmlns.com/foaf/0.1/name
     http://xmlns.com/foaf/0.1/mbox
     ...   </pre>
</blockquote>
<h4>Example 2 (a subquery variation):</h4>

<pre>SQL&gt; select distinct subseq (p, strchr (p, &#39;#&#39;)) as fragment
 from (sparql select distinct ?p where { graph ?g { ?s ?p ?o } } ) as all_predicates
 where p like &#39;%#%&#39; ;</pre>
<blockquote>
  <pre>
     fragment varchar
     ----------
     #query
     #data
     #name
     #comment
     ...</pre>
</blockquote>
<h3>Parameterized Queries:</h3>
 <p>You can pass parameters to a SPARQL query using a Virtuoso-specific syntax extension. &#39;??&#39; or &#39;$?&#39; indicates a positional parameter similar to &#39;?&#39; in standard SQL. &#39;??&#39; can be used in graph patterns or anywhere else where a SPARQL variable is accepted. The value of a parameter should be passed in SQL form, i.e. this should be a number or an untyped string. An IRI ID can not be passed, but an absolute IRI can.
Using this notation, a dynamic SQL capable client (ODBC, JDBC, ADO.NET, OLEDB, XMLA, or others) can execute parameterized SPARQL queries using parameter binding concepts that are commonplace in dynamic SQL. This implies that existing SQL applications and development environments (PHP, Ruby, Python, Perl, VB, C#, Java, etc.) are capable of issuing SPARQL queries via their existing SQL bound data access channels against RDF Data stored in Virtuoso. </p>
 <p>Note: This is the Virtuoso equivalent of a <a href="http://seaborne.blogspot.com/2006/05/parameterized-queries_07.html">recently published example using Jena </a>(a Java based RDF Triple Store).</p>
 <h3>Example:</h3>

<p>Create a Virtuoso Function by executing the following: </p>

<pre>SQL&gt; create function param_passing_demo ();
 {
 	declare stat, msg varchar;
 	declare mdata, rset any;
 	exec (&#39;sparql select ?s where { graph ?g { ?s ?? ?? }}&#39;,
 			stat, msg,
 			vector (&#39;http://www.w3.org/2001/sw/DataAccess/tests/data/Sorting/sort-0#int1&#39;,
 		  		   4 ),	-- Vector of two parameters 
			10,			-- Max. result-set rows
			mdata, 		-- Variable for handling result-set metadata
 		 	rset   		-- Variable for handling query result-set
		 ); 
     return rset[0][0];
 }

</pre>
Test new &quot;param_passing_demo&quot; function by executing the following: <br />
<pre>SQL&gt; select param_passing_demo ();
</pre>
<p>Which returns: </p>
<blockquote>
  <pre>
callret VARCHAR
 _______________________________________________________________________________</pre>
  <pre>http://www.w3.org/2001/sw/DataAccess/tests/data/Sorting/sort-0#four</pre>
  <pre>1 Rows. -- 00000 msec.</pre>
</blockquote>
<h3>&#160;</h3>

<h3>Using SPARQL in SQL Predicates:</h3>

<p>A SPARQL ASK query can be used as an argument of the SQL EXISTS predicate.</p>

<pre>create function sparql_ask_demo () returns varchar
  {
 		if (exists (sparql ask where { graph ?g { ?s ?p 4}})) return &#39;YES&#39;;
 		else return &#39;NO&#39;;
   };
</pre>

<p>
<br />
    Test by executing: </p>
<pre>SQL&gt; select sparql_ask_demo ();
</pre>
<p>Which returns:</p>
<pre>_________________________
YES</pre>
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2006-04-05#947">
  <rss:title>Swoogle knows how Semantic Web ontologies are used</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2006-04-05T20:00:36Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Swoogle knows how Semantic Web ontologies are used: &quot; The Dublin Core Metadata Initiative is updating the RDF expression of DC and might add range restrictions to some properties. Mikael Nilsson wondered if we would use the Swoogle Semantic Web search engine to see what types of values are being used with DC properties. This kind of query is just the ticket for Swoogle. Well, almost. The current web-based interface supports a limited number of query types. Many more can be asked if you use SQL directly to query Swoogle’s underlying databases. We don’t want to provide a direct SQL query service over the main Swoogle database because it’s easy to ask a query that will take a looooooong time to answer and some could even crash the database server. We are planning to put up a second server with a copy of the database and we give Swoogle Power Users (SPUs) access to it. We ran a simple SQL query to generate some initial data for Mikael showing fall of the DC properties. For each one, we list all of the ranges that values were drawn from and the number of separate documents and triples for each combination. For example Property Range Documents Triples dc:creater rdfs:Literal 32 648 dc:creator rdfs:Literal 234655 2477665 dc:creator wn:Person 2714 1138250 dc:creator cc:Agent 4090 6359 dc:creator foaf:Person 2281 5969 dc:creator foaf:Agent 1723 3234 Notice that the first property in this partial table is an obvious typo. You can see the complete table as pdf file or as an excel spreadsheet. [Tim Finin, UMBC ebiquity lab] &quot; (Via Planet RDF.)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>
<a href="http://ebiquity.umbc.edu/blogger/2006/04/04/swoogle-knows-how-semantic-web-ontologies-are-used/">Swoogle knows how Semantic Web ontologies are used</a>: &quot;</p>
<div xmlns="http://www.w3.org/1999/xhtml">
<div>
<p>
    <img src="http://swoogle.umbc.edu/images/logo_mini.png" align="right" alt="" />
  </p>
<p>The <a href="http://dublincore.org/">Dublin Core Metadata
Initiative</a> is updating the RDF expression of DC and might add
range restrictions to some properties. Mikael Nilsson wondered if
we would use the <a href="http://swoogle.umbc.edu/">Swoogle
Semantic Web search engine</a> to see what types of values are
being used with DC properties.</p>
<p>This kind of query is just the ticket for Swoogle. Well, almost.
The current web-based interface supports a limited number of query
types. Many more can be asked if you use SQL directly to query
Swoogle’s underlying databases. We don’t want to provide a direct
SQL query service over the main Swoogle database because it’s easy
to ask a query that will take a looooooong time to answer and some
could even crash the database server. We are planning to put up a
second server with a copy of the database and we give <em>Swoogle
Power Users</em> (SPUs) access to it.</p>
<p>We ran a simple SQL query to generate some initial data for
Mikael showing fall of the DC properties. For each one, we list all
of the ranges that values were drawn from and the number of
separate documents and triples for each combination. For
example</p>
<table border="1" align="center" cellpadding="6" bgcolor="#E9E9E9">
<tr bgcolor="#333300">
<td>
<div align="center">
      <font color="#FFFFFF"><strong>Property</strong>
      </font>
      </div>
</td>
<td>
<div align="center">
      <font color="#FFFFFF"><strong>Range</strong>
      </font>
      </div>
</td>
<td>
<div align="center">
      <font color="#FFFFFF"><strong>Documents</strong>
      </font>
      </div>
</td>
<td>
<div align="center">
      <font color="#FFFFFF"><strong>Triples</strong>
      </font>
      </div>
</td>
</tr>
<tr>
<td>dc:creater</td>
<td>rdfs:Literal</td>
<td>
<div align="right">32</div>
</td>
<td>
<div align="right">648</div>
</td>
</tr>
<tr>
<td>dc:creator</td>
<td>rdfs:Literal</td>
<td>
<div align="right">234655</div>
</td>
<td>
<div align="right">2477665</div>
</td>
</tr>
<tr>
<td>dc:creator</td>
<td>wn:Person</td>
<td>
<div align="right">2714</div>
</td>
<td>
<div align="right">1138250</div>
</td>
</tr>
<tr>
<td>dc:creator</td>
<td>cc:Agent</td>
<td>
<div align="right">4090</div>
</td>
<td>
<div align="right">6359</div>
</td>
</tr>
<tr>
<td>dc:creator</td>
<td>foaf:Person</td>
<td>
<div align="right">2281</div>
</td>
<td>
<div align="right">5969</div>
</td>
</tr>
<tr>
<td>dc:creator</td>
<td>foaf:Agent</td>
<td>
<div align="right">1723</div>
</td>
<td>
<div align="right">3234</div>
</td>
</tr>
</table>
<p>Notice that the first property in this partial table is an
obvious typo. You can see the complete table as <a href="http://www.csee.umbc.edu/~finin/noindex/dc/dcPropertiesRanges.pdf">
pdf</a> file or as an excel <a href="http://www.csee.umbc.edu/~finin/noindex/dc/dcPropertiesRanges.xls">
spreadsheet</a>.</p>
<p>[Tim Finin, UMBC ebiquity lab]</p>
</div>
</div>&quot;

<p>(Via <a href="http://planetrdf.com/">Planet RDF</a>.)</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-11-14#902">
  <rss:title>This Week’s Semantic Web</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2005-11-14T19:44:03Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">(Via Danny Ayers.): This Week’s Semantic Web: &quot;Ok, my first attempt at a round-up (in response to Phil’s observation of Planetary damage). Thanks to the conference there’s loads more here than there’s likely to be subsequent weeks, although it’s still only a fairly random sample and some of the links here are to heaps of other resources… Incidentally, if anyone’s got a list/links for SemWeb-related blogs that aren’t on Planet RDF, I’d be grateful for a pointer. PS. Ok, I forget… are there any blogs that aren’t on Dave’s list yet..? Quote of the week: In the Semantic Web, it is not the Semantic which is new, it is the Web which is new. - Chris Welty, IBM (lifted from TimBL’s slides) Events 4th International Semantic Web Conference - happened this week, see : ISWC2005 Semantic Bank Semantic Desktop Workshop, 9-13 December 2005, Berlin Semantic Web Applications and Perspectives/Workshop (SWAP2005), 14-16 December, 2005 Jena User Conference - May 10-11 2006, Bristol UK Docs etc Conference highlights on the #swig chump: 2005-11-06, -07, -08, -09, -10; Ian’s notes; John’s resources; Leo’s stories; Uldis’ call to action; del.icio.us/iswc2005; flickr/iswc2005; foaf-moblog. Slides from Sir TimBL’s conference keynotes: Semantic Web for the Industry, Putting the Web back in Semantic Web Daniel Weitzner’s keynote: Privacy, Provenance, Property and Personhood Long-time SW researcher Stefan Decker now has a blog, inspirationally entitled Stefan Decker on the Semantic Web. (Stefan’s one of the head honchos at DERI). Sample snippet: I just noticed the article from Dan Zambonini “Is Web 2.0 killing the Semantic Web?”. From my perspective the article shows a misconception that people seems to have around the Semantic Web: the Semantic Web effort itself is not provide applications (like the Web 2.0 meme indicates) - it rather provides standards to interlink applications. 
Leigh Dodds has two pieces demonstrating neat facilities offered by ARQ the SPARQL query API for Java: parameterised queries and extension functions. A new W3C Working Group has been chartered: Rule Interchange Format WG - “ to produce a core rule language plus extensions which together allow rules to be translated between rule languages and thus transferred between rule systems.”. As noted by dajobe, phase 1 includes making a new XML syntax for RDF… UMBC Semantic Web Reference Card - if you only print one thing this year…or did you already do the SPARQL Reference card..? WebDescription - root wiki page for collecting notes on web description languages (ESW Wiki, announcement) Bot - IRC/Jabber chat bots that are either in use by Semantic Web developers or use Semantic Web technologies (ESW Wiki) microformat FAQs for RDF fans (ESW Wiki) W3C working draft : WSDL 2.0 - RDF Mapping SKOS (Simple Knowledge Organisation System) updated drafts: SKOS Core Vocabulary Specification, SKOS Core Guide working draft: SPARQL Protocol for RDF Using WSDL 1.1 A relational algebra for SPARQL, Note on database layouts for SPARQL datastores (PDFs, Richard Cyganiak, HP) Amateur Fiction Online - The Web of Community Trust A Case Study in Community Focused Design for the SemanticWeb (PDF) Building a Semantic Wiki - IEEE article. See also: SemperWiki - Semantic Personal Wiki, WikSAR - Towards a Semantic Wiki Experience Software and stuff Semantic Web Challenge applications (winner: CONFOTO - congrats bengee!) Piggy Bank 2.1.1 released. IRIS is a semantic desktop application framework that enables users to create a “personal map” across their office-related information objects. IRIS includes a machine-learning platform to help automate this process. It provides “dashboard” views, contextual navigation, and relationship-based structure across an extensible suite of office applications, including a calendar, web and file browser, e-mail client, and instant messaging client. 
(open source release due Jan 2006) MKSearch - “A new kind of search engine” - RDF-backed (Sesame) with Web crawler, extracts and indexes metadata. FOAFRealm - Our goal is to design and implement D-FOAF, a distributed authentication and trust infrastructure without a centralised authority. D-FOAF will be a backbone for trust applications based on social relationships and will establish identity of users similar to the way we establish identify and trust in real life. Perl Net::Flickr::RDF WordPress SIOC (Semantically-Interlinked Online Communities) plugin updated (just copy wp-sioc.php into the root of your WP install and it just works) OntoMedia is intended for the representation of heterogenous media through description of the semantic content of that media. The representation may be limited to the description of some or all of the elements contained within the source or may include information regarding the narrative relationship that these elements have both to the media and to each other. mSpace is an interaction model to help explore relationships in information - “Imagine Google on iTunes” Blog post title of the week: Don’t give me that monkey-ass Web 1.0, either - Uche Ogbuji Also…a new threat to Semantic Web developers has been discovered: typhoid!, and the key to the Web’s full potential is…Tetris.&quot;</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
 <p>(Via <a href="http://dannyayers.com">Danny Ayers</a>.):</p>  <p><a href="http://dannyayers.com/archives/2005/11/13/this-weeks-semantic-web/">This Week’s Semantic Web</a>: </p><p>&quot;Ok, my first attempt at a round-up (in response to Phil’s observation of <a href="http://weblog.philringnalda.com/?p=1008">Planetary damage</a>). Thanks to the conference there’s loads more here than there’s likely to be subsequent weeks, although it’s still only a fairly random sample and some of the links here are to heaps of other resources…<br /> <em>Incidentally, if anyone’s got a list/links for SemWeb-related blogs that aren’t on <a href="http://planetrdf.com">Planet RDF</a>, I’d be grateful for a pointer. PS. Ok, I forget… are there any blogs that aren’t on Dave’s <a href="http://journal.dajobe.org/journal/2003/07/semblogs/">list</a> yet..?</em></p> 	<p>Quote of the week:</p> 	<blockquote><p> In the Semantic Web, it is not the Semantic which is new, it is the Web which is new. 
</p></blockquote> 	<p>- <a href="http://www.research.ibm.com/people/w/welty/">Chris Welty</a>, IBM (lifted from TimBL’s <a href="http://www.w3.org/2005/Talks/1110-iswc-tbl/">slides</a>)</p> 	<h4>Events</h4> 	<ul> <li><a href="http://iswc2005.semanticweb.org/">4th International Semantic Web Conference</a> - happened this week, see : <a href="http://simile.mit.edu/conference/iswc2005/">ISWC2005 Semantic Bank</a></li> 	<li><a href="http://www.gnowsis.org/Events/HackBerlin2005">Semantic Desktop Workshop</a>, 9-13 December 2005, Berlin</li> 	<li><a href="http://trinity.dit.unitn.it/vikef/swap2005/">Semantic Web Applications and Perspectives/Workshop</a> (SWAP2005), 14-16 December, 2005</li> 	<li><a href="http://jena.hpl.hp.com/juc2006"> Jena User Conference</a> - May 10-11 2006, Bristol UK</li> 	</ul> 	<h4>Docs etc</h4> 	<ul> 	<li> Conference highlights on the #swig chump: <a href="http://swig.xmlhack.com/2005/11/06/2005-11-06.html">2005-11-06</a>, <a href="http://swig.xmlhack.com/2005/11/06/2005-11-07.html">-07</a>, <a href="http://swig.xmlhack.com/2005/11/06/2005-11-08.html">-08</a>, <a href="http://swig.xmlhack.com/2005/11/06/2005-11-09.html">-09</a>, <a href="http://swig.xmlhack.com/2005/11/06/2005-11-10.html">-10</a>; Ian’s <a href="http://internetalchemy.org/tag/iswc2005">notes</a>; John’s <a href="http://www.johnbreslin.com/blog/2005/11/06/iswc-2005/">resources</a>; Leo’s <a href="http://leobard.twoday.net/topics/SemWeb">stories</a>; Uldis’ <a href="http://captsolo.net/info/blog_a.php/2005/11/12/iswc_2005_do_the_right_thing">call to action</a>; <a href="http://del.icio.us/tag/iswc2005">del.icio.us/iswc2005</a>; <a href="http://www.flickr.com/photos/tags/iswc2005/">flickr/iswc2005</a>; <a href="http://www.foaf-project.org/2004/media/date/2005/11/">foaf-moblog</a>. 
</li> 	<li>Slides from Sir TimBL’s conference keynotes: <a href="http://www.w3.org/2005/Talks/1107-iswc-tbl/">Semantic Web for the Industry</a>, <a href="http://www.w3.org/2005/Talks/1110-iswc-tbl/">Putting the Web back in Semantic Web</a></li> 	<li>Daniel Weitzner’s keynote: <a href="http://www.w3.org/2005/Talks/1110-p4-semweb-iswc/">Privacy, Provenance, Property and Personhood</a></li> 	<li>Long-time SW researcher <a href="http://www.stefandecker.org">Stefan Decker</a> now has a blog, inspirationally entitled <a href="http://www.stefandecker.org/blog/">Stefan Decker on the Semantic Web</a>. (Stefan’s one of the head honchos at <a href="http://www.deri.ie/">DERI</a>). Sample snippet:<br /> 	<blockquote><p> I just noticed the article from Dan Zambonini “<a href="http://www.oreillynet.com/pub/wlg/8013?CMP=OTC-TY3388567169">Is Web 2.0 killing the Semantic Web?</a>”. From my perspective the article shows a misconception that people seems to have around the Semantic Web: the Semantic Web effort itself is not provide applications (like the <a href="http://www.oreillynet.com/pub/a/oreilly/tim/news/2005/09/30/what-is-web-20.html">Web 2.0 meme</a> indicates)  - it rather provides standards to interlink applications. </p></blockquote> 	</li> 	<li>Leigh Dodds has two pieces demonstrating neat facilities offered by <a href="http://jena.sourceforge.net/ARQ">ARQ</a> the SPARQL query API for Java: <a href="http://www.ldodds.com/blog/archives/000251.html">parameterised queries</a> and <a href="http://www.ldodds.com/blog/archives/000252.html">extension functions</a>. </li> 	<li>A new W3C Working Group has been chartered: <a href="http://www.w3.org/2005/rules/wg/charter">Rule Interchange Format WG</a> - <em>“ to produce a core rule language plus extensions which together allow rules to be translated between rule languages and thus transferred between rule systems.”</em>. 
As noted by <a href="http://journal.dajobe.org/journal/">dajobe</a>, phase 1 includes making a new XML syntax for RDF…</li> 	<li><a href="http://ebiquity.umbc.edu/resource/html/id/94/">UMBC Semantic Web Reference Card</a> <em>- if you only print one thing this year…or did you already do the <a href="http://www.dajobe.org/2005/04-sparql/">SPARQL Reference card</a>..?</em></li> 	<li><a href="http://esw.w3.org/topic/WebDescription">WebDescription</a> - root wiki page for collecting notes on web description languages (ESW Wiki, <a href="http://lists.w3.org/Archives/Public/public-web-http-desc/2005Nov/0000.html">announcement</a>)</li> 	<li><a href="http://esw.w3.org/topic/Bot">Bot</a> - IRC/Jabber chat bots that are either in use by Semantic Web developers or use Semantic Web technologies (ESW Wiki)</li> 	<li><a href="http://microformats.org/wiki/faqs-for-rdf">microformat FAQs for RDF fans</a> (ESW Wiki)</li> 	<li> W3C working draft : <a href="http://www.w3.org/TR/wsdl20-rdf/">WSDL 2.0 - RDF Mapping</a></li> 	<li>SKOS (Simple Knowledge Organisation System) updated drafts: <a href="http://www.w3.org/TR/swbp-skos-core-spec">SKOS Core Vocabulary Specification</a>, <a href="http://www.w3.org/TR/swbp-skos-core-guide">SKOS Core Guide</a></li> 	<li>working draft: <a href="http://www.w3.org/TR/sprot11/">SPARQL Protocol for RDF Using WSDL 1.1</a></li> 	<li><a href="http://www.hpl.hp.com/techreports/2005/HPL-2005-170.html">A relational algebra for SPARQL</a>, <a href="http://www.hpl.hp.com/techreports/2005/HPL-2005-171.html">Note on database layouts for SPARQL datastores</a> (PDFs, Richard Cyganiak, HP)</li> 	<li><a href="http://eprints.ecs.soton.ac.uk/11042/">Amateur Fiction Online</a> - The Web of Community Trust A Case Study in Community Focused Design for the SemanticWeb (<a href="http://eprints.ecs.soton.ac.uk/11042/01/case_study.pdf">PDF</a>)</li> 	<li><a 
href="http://dsonline.computer.org/portal/site/dsonline/menuitem.9ed3d9924aeb0dcd82ccc6716bbe36ec/index.jsp?&amp;pName=dso_level1&amp;path=dsonline/0511&amp;file=x5sem.xml&amp;xsl=article.xsl">Building a Semantic Wiki</a> - IEEE article. See also: <a href="http://m3pe.org/semperwiki/">SemperWiki - Semantic Personal Wiki</a>, <a href="http://wiki.navigable.info/"> WikSAR - Towards a Semantic Wiki Experience</a> <br /> </li> 	</ul> 	<h4>Software and stuff</h4> 	<ul> 	<li><a href="http://challenge.semanticweb.org/">Semantic Web Challenge</a> applications (winner: <a href="http://www.confoto.org/">CONFOTO</a> - congrats bengee!)</li> 	<li><a href="http://simile.mit.edu/piggy-bank/">Piggy Bank 2.1.1</a> released.</li> 	<li> <a href="http://www.openiris.org/">IRIS</a> is a semantic desktop application framework that enables users to create a “personal map” across their office-related information objects. IRIS includes a machine-learning platform to help automate this process.  It provides “dashboard” views, contextual navigation, and relationship-based structure across an extensible suite of office applications, including a calendar, web and file browser, e-mail client, and instant messaging client.<br /> <em>(open source release due Jan 2006)</em> </li> 	<li><a href="http://www.mksearch.mkdoc.org/">MKSearch</a> - <em>“A new kind of search engine”</em> - RDF-backed (Sesame) with Web crawler, extracts and indexes metadata.</li> 	<li><a href="http://www.foafrealm.org">FOAFRealm</a> - Our goal is to design and implement D-FOAF, a distributed authentication and trust infrastructure without a centralised authority. 
D-FOAF will be a backbone for trust applications based on social relationships and will establish identity of users similar to the way we establish identify and trust in real life.</li> 	<li>Perl <a href="http://search.cpan.org/dist/Net-Flickr-RDF-1.1/">Net::Flickr::RDF</a></li> 	<li>WordPress <a href="http://rdfs.org">SIOC</a>  (Semantically-Interlinked Online Communities)  plugin updated (just copy <a href="http://sw.deri.org/svn/sw/2005/08/sioc/wordpress/wp-sioc.php">wp-sioc.php</a> into the root of your WP install and it <em>just works</em>)</li> 	<li><a href="http://ontomedia.ecs.soton.ac.uk/">OntoMedia</a> is intended for the representation of heterogenous media through description of the semantic content of that media. The representation may be limited to the description of some or all of the elements contained within the source or may include information regarding the narrative relationship that these elements have both to the media and to each other.</li> 	<li><a href="http://mspace.fm/">mSpace</a> is an interaction model to help explore relationships in information - <em>“Imagine Google on iTunes”</em></li> 	</ul> 	<p>Blog post title of the week: </p> 	<blockquote><p> <a href="http://copia.ogbuji.net/blog/2005-11-12/Don_t_give">Don’t give me that monkey-ass Web 1.0, either</a> </p></blockquote> 	<p>- <a href="http://copia.ogbuji.net/blog/">Uche Ogbuji</a></p> 	<p>Also…a new threat to Semantic Web developers has been discovered: <a href="http://planb.nicecupoftea.org/archives/001309.html">typhoid</a>!, and  the key to the Web’s full potential is…<a href="http://www.foaf-project.org/2004/media/2005/11/07/3448">Tetris</a>.&quot; </p>  
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-10-26#882">
  <rss:title>Breaking the Web Wide Open! </rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2005-10-26T19:28:47Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Marc Canter&#39;s Breaking the Web Wide Open! article is something I found pretty late (by my normal discovery standards). This was partly due to the pre- and post- Web 2.0 event noise levels that have dumped the description of an important industry inflection into the &quot;Bozo Bin&quot; of many. Personally, I think we shouldn&#39;t confuse the Web 2.0 traditional-pitch-fest conference with an attempt to identify an important industry inflection). Anyway, Marc&#39;s article is a very refreshing read because it provides a really good insight into the general landscape of a rapidly evolving Web alongside genuine appreciation of our broader timeless pursuit of &quot;Openness&quot;. To really help this document provide additional value have scrapped the content of the original post and dumped it below so that we can appreciate the value of the links embedded within the article (note: thanks to Virtuoso I only had to paste the content into my blog, the extraction to my Linkblog and Blog Summary Pages are simply features of my Virtuoso based Blog Engine):Breaking the Web Wide Open! (complete story)Even the web giants like AOL, Google, MSN, and Yahoo need to observe these open standards, or they&#39;ll risk becoming the &quot;walled gardens&quot; of the new web and be coolio no more.Marc Canter [Broadband Mechanics, Inc.] | POSTED: 09.26.05 @12:00Editorial Note: Several months ago, AlwaysOn got a personal invitation from Yahoo founder Jerry Yang &quot;to see and give us feedback on our new social media product, y!360.&quot; We were happy to oblige and dutifully showed up, joining a conference room full of hard-core bloggers and new, new media types. The geeks gave Yahoo 360 an overwhelming thumbs down, with comments like, &quot;So the only services I can use within this new network are Yahoo services? 
What if I don&#39;t use Yahoo IM?&quot; In essence, the Yahoo team was booed for being &quot;closed web,&quot; and we heartily agreed. With Yahoo 360, Yahoo continues building its own &quot;walled garden&quot; to control its 135 million customersÂan accusation also hurled at AOL in the early 1990s, before AOL migrated its private network service onto the web. As theÂ  Economist recently noted, &quot;Yahoo, in short, has old media plans for the new-media era.&quot;The irony to our view here is, of course, that today&#39;s AO Network is also a &quot;closed web.&quot; In the end, Mr. Yang&#39;s thoughtful invitation and our ensuing disappointment in his new service led to the assignment of this article. It also confirmed our existing plan to completely revamp the AO Network around open standards. To tie it all together, we recruited the chief architect of our new site, the notorious Marc Canter, to pen this piece. We look forward to our reader feedback.Breaking the Web Wide Open!By Marc CanterFor decades, &quot;walled gardens&quot; of proprietary standards and content have been the strategy of dominant players in mainframe computer software, wireless telecommunications services, and the World Wide WebÂit was their successful lock-in strategy of keeping their customers theirs. But like it or not, those walls are tumbling down. Open web standards are being adopted so widely, with such value and impact, that the web giantsÂAmazon, AOL, eBay, Google, Microsoft, and YahooÂare facing the difficult decision of opening up to what they don&#39;t control.The online world is evolving into a new open web (sometimes called the Web 2.0), which is all about being personalized and customized for each user. Not only open source software, but open standardsÂ are becoming an essential component. Many of the web giants have been using open source software for years. 
Most of them use at least parts of the LAMP (Linux, Apache, MySQL, Perl/Python/PHP) stack, even if they aren&#39;t well-known for giving back to the open source community. For these incumbents that grew big on proprietary web services, the methods, practices, and applications of open source software development are difficult to fully adopt. And the next open source movementsÂwhich will be as much about open standards as about codeÂwill be a lot harder for the incumbents to exploit.While the incumbents use cheap open source software to run their back-ends systems, their business models largely depend on proprietary software and algorithms. But our view a new slew of open software, open protocols, and open standards will confront the incumbents with the classic Innovator&#39;s Dilemma.Â  Should they adopt these tools and standards, painfully cannibalizing their existing revenue for a new unproven concept, or should they stick with their currently lucrative model with the risk that eventually a bunch of upstarts eat their lunch? Credit should go to several of the web giants who have been making efforts to &quot;open up.&quot; Google, Yahoo, eBay, and Amazon all have Open APIs (Application Programming Interfaces) built into their data and systems. Any software developer can access and use them for whatever creative purposes they wish. This means that the API provider becomes an open platform for everyone to use and build on top of. This notion has expanded like wildfire throughout the blogosphere, so nowadays, Open APIs are pretty much required.Other incumbents also have open strategies. AOL has got the RSS religion, providing a feedreader and RSS search in order to escape the &quot;walled garden of content&quot; stigma. Apple now incorporates podcasts, the &quot;personal radio shows&quot; that are latest rage in audio narrowcasting, into iTunes. 
Even Microsoft is supporting open standards, for example by endorsing SIP (Session Initiation Protocol) for internet telephony and conferencing over Skype&#39;s proprietary format or one of its own devising.But new open standards and protocols are in use, under construction, or being proposed every day, pushing the envelope of where we are right now. Many of these standards are coming from startup companies and small groups of developers, not from the giants. Together with the Open APIs, those new standards will contribute to a new, open infrastructure. Tens of thousands of developers will use and improve this open infrastructure to create new kinds of web-based applications and services, to offer web users a highly personalized online experience.A Brief History of OpennessAt this point, I have to admit that I am not just a passive observer, full-time journalist or &quot;just some blogger&quot;Âbut an active evangelist and developer of these standards. It&#39;s the vision of &quot;open infrastructure&quot; that&#39;s driving my company and the reason why I&#39;m writing this article. This article will give you some of the background behind on these standards, and what the evolution of the next generation of open standards will look like.Starting back in the 1980s, establishing a software standard was a key strategy for any software company. My former company, MacroMind (which became Macromedia), achieved this goal early on with Director. As Director evolved into Flash, the world saw that other companies besides Microsoft, Adobe, and Apple could establish true cross-platform, independent media standards.Then Tim Berners-Lee and Marc Andreessen came along, and changed the rules of the software business and of entrepreneurialism. No matter how entrenched and &quot;standardized&quot; software was, the rug could still get pulled out from under it. Netscape did it to Microsoft, and then Microsoft did it backÂ  to Netscape. 
The web evolved, and lots of standards evolved with it. The leading open source standards (such as the LAMP stack) became widely used alternatives to proprietary closed-source offerings. Open standards are more than just technology. Open standards mean sharing, empowering, and community support. Someone floats a new idea (or meme) and the community runs with it â with each person making their own contributions to the standard â evolving it without a moment&#39;s hesitation about &quot;giving away their intellectual property.&quot;One good example of this was Dave Sifry, who built the Technorati blog-tracking technology inspired by the Blogging Ecosystem, a weekend project by young hacker Phil Pearson. Dave liked what he saw and he ran with itÂturning Technorati into what it is today.Dave Winer has contributed enormously to this area of open standards. He defined and personally created several open standards and protocolsÂsuch as RSS, OPML, and XML-RPC. Dave has also helped build the blogosphere through his enthusiasm and passion.By 2003, hundreds of programmers were working on creating and establishing new standards for almost everything. The best of these new standards have evolved into compelling web services platforms â such as del.icio.us, Webjay, or Flickr. Some have even spun off formal standards â like XSPF (a standard for playlists) or instant messaging standard XMPP (also known as Jabber).Today&#39;s Open APIs are complemented by standardized SchemasÂthe structure of the data itself and its associated meta-data. Take for example a podcasting feed. It consists of: a) the radio show itself, b) information on who is on the show, what the show is about and how long the show is (the meta-data) and also c) API calls to retrieve a show (a single feed item) and play it from a specified server. 
The combination of Open APIs, standardized schemas for handling meta-data, and an industry which agrees on these standards are breaking the web wide open right now. So what new open standards should the web incumbentsÂand youÂbe watching? Keep an eye on the following developments:IdentityAttentionOpen MediaMicrocontent PublishingOpen Social NetworksTagsPinging RoutingOpen CommunicationsDevice Management and Control1. IdentityRight now, you don&#39;t really control your own online identity. At the core of just about every online piece of software is a membership system. Some systems allow you to browse a site anonymouslyÂbut unless you register with the site you can&#39;t do things like search for an article, post a comment, buy something, or review it. The problem is that each and every site has its own membership system. So you constantly have to register with new systems, which cannot share dataÂeven you&#39;d want them to. By establishing a &quot;single sign-on&quot; standard, disparate sites can allow users to freely move from site to site, and let them control the movement of their personal profile data, as well as any other data they&#39;ve created. With Passport, Microsoft unsuccessfully attempted to force its proprietary standard on the industry. Instead, a world is evolving where most people assume that users want to control their own data, whether that data is their profile, their blog posts and photos, or some collection of their past interactions, purchases, and recommendations. As long as users can control their digital identity, any kind of service or interaction can be layered on top of it.Identity 2.0 is all about users controlling their own profile data and becoming their own agents. This way the users themselves, rather than other intermediaries, will profit from their ID info. 
Once developers start offering single sign-on to their users, and users have trusted places to store their dataÂwhich respect the limits and provide access controls over that data, users will be able to access personalized services which will understand and use their personal data.Identity 2.0 may seem like some geeky, visionary future standard that isn&#39;t defined yet, but by putting each user&#39;s digital identity at the core of all their online experiences, Identity 2.0 is becoming the cornerstone of the new open web. The Initiatives:Right now, Identity 2.0 is under construction through various efforts from Microsoft (the &quot;InfoCard&quot; component built into the Vista operating system and its &quot;Identity Metasystem&quot;), Sxip Identity, Identity Commons, Liberty Alliance, LID (NetMesh&#39;s Lightweight ID), and SixApart&#39;s OpenID.More Movers and Shakers:Identity Commons and Kaliya Hamlin, Sxip Identity and Dick Hardt, the Identity Gang and Doc Searls, Microsoft&#39;s Kim Cameron, Craig Burton, Phil Windley, and Brad Fitzpatrick, to name a few.2. AttentionHow many readers know what their online attention is worth? If you don&#39;t, Google and Yahoo doÂthey make their living off our attention. They know what we&#39;re searching for, happily turn it into a keyword, and sell that keyword to advertisers. They make money off our attention. We don&#39;t. Technorati and friends proposed an attention standard, Attention.xml, designed to &quot;help you keep track of what you&#39;ve read, what you&#39;re spending time on, and what you should be paying attention to.&quot; AttentionTrust is an effort by Steve Gillmor and Seth Goldstein to standardize on how captured end-user performance, browsing, and interest data are used. Blogger Peter Caputa gives a good summary of AttentionTrust: &quot;As we use the web, we reveal lots of information about ourselves by what we pay attention to. 
Imagine if all of that information could be stored in a nice neat little xml file. And when we travel around the web, we can optionally share it with websites or other people. We can make them pay for it, lease it ... we get to decide who has access to it, how long they have access to it, and what we want in return. And they have to tell us what they are going to do with our Attention data.&quot;So when you give your attention to sites that adhere to the AttentionTrust, your attention rights (you own your attention, you can move your attention, you can pay attention and be paid for it,Â  and you can see how your attention is used) are guaranteed. Attention data is crucial to the future of the open web, and Steve and Seth are making sure that no one entity or oligopoly controls it. Movers and Shakers:Steve Gillmor, Seth Goldstein, Dave Sifry and the other Attention.xml folks. 3. Open MediaProprietary media standardsÂFlash, Windows Media, and QuickTime, to name a few Âhelped liven up the web. But they are proprietary standards that try to keep us locked in, and they weren&#39;t created from scratch to handle today&#39;s online content. That&#39;s why, for many of us, an Open Media standard has been a holy grail. Yahoo&#39;s new Media RSS standard brings us one step closer to achieving open media, as do Ogg Vorbis audio codecs, XSPF playlists, or MusicBrainz. And several sites offer digital creators not only a place to store their content, but also to sell it. Media RSS (being developed by Yahoo with help from the community) extends RSS and combines it with &quot;RSS enclosures&quot; Âadds metadata to any media itemÂto create a comprehensive solution for media &quot;narrowcasters.&quot; To gain acceptance for Media RSS, Yahoo knows it has to work with the community. 
As an active member of this community, I can tell you that we&#39;ll create Media RSS equivalents for rdf (an alternative subscription format) and Atom (yet anotherÂ  subscription format), so no one will be able to complain that Yahoo is picking sides in format wars.When Yahoo announced the purchase of Flickr, Yahoo founder Jerry Yang insinuated that Yahoo is acquiring &quot;open DNA&quot; to turn Yahoo into an open standards player. Yahoo is showing what happens when you take a multi-billion dollar company and make openness one of its core valuesÂso Google, beware, even if Google does have more research fellows and Ph.D.s. The open media landscape is far and wide, reaching from game machine hacks and mobile phone downloads to PC-driven bookmarklets, players, and editors, and it includes many other standardization efforts. XSPF is an open standard for playlists, and MusicBrainz is an alternative to the proprietary (and originally effectively stolen) database that Gracenote licenses. Ourmedia.org is a community front-end to Brewster Kahle&#39;s Internet Archive. Brewster has promised free bandwidth and free storage forever to any content creators who choose to share their content via the Internet Archive. Ourmedia.org is providing an easy-to-use interface and community to get content in and out of the Internet Archive, giving ourmedia.org users the ability to share their media anywhere they wish, without being locked into a particular service or tool. Ourmedia plans to offer open APIs and an open media registry that interconnects other open media repositories into a DNS-like registry (just like the www domain system), so folks can browse and discover open content across many open media services. 
Systems like Brightcove and Odeo support the concept of an open registry, and hope to work with digital creators to sell their work to fulfill the financial aspect of the &quot;Long Tail.&quot; More Movers and Shakers: Creative Commons, the Open Media Network, Jay Dedman, Ryanne Hodson, Michael Verdi, Eli Chapman, Kenyatta Cheese, Doug Kaye, Brad Horowitz, Lucas Gonze, Robert Kaye, Christopher Allen, Brewster Kahle, JD Lasica, and indeed, Marc Canter, among others. 4. Microcontent Publishing: Unstructured content is cheap to create, but hard to search through. Structured content is expensive to create, but easy to search. Microformats resolve the dilemma with simple structures that are cheap to use and easy to search. The first kind of widely adopted microcontent is blogging. Every post is an encapsulated idea, addressable via a URL called a permalink. You can syndicate or subscribe to this microcontent using RSS or an RSS equivalent, and news or blog aggregators can then display these feeds in a convenient readable fashion. But a blog post is just a block of unstructured text—not a bad thing, but just a first step for microcontent. When it comes to structured data, such as personal identity profiles, product reviews, or calendar-type event data, RSS was not designed to maintain the integrity of the structures. Right now, blogging doesn&#39;t have the underlying structure necessary for full-fledged microcontent publishing. But that will change. Think of local information services (such as movie listings, event guides, or restaurant reviews) that any college kid can access and use in her weekend programming project to create new services and tools. Today&#39;s blogging tools will evolve into microcontent publishing systems, and will help spread the notion of structured data across the blogosphere. New ways to store, represent and produce microcontent will create new standards, such as Structured Blogging and Microformats. 
Microformats differ from RSS feeds in that you can&#39;t subscribe to them. Instead, Microformats are embedded into webpages and discovered by search engines like Google or Technorati. Microformats are creating common definitions for &quot;What is a review or event? What are the specific fields in the data structure?&quot; They can also specify what we can do with all this information.OPML (Outline Processor Markup Language) is a hierarchical file format for storing microcontent and structured data. It was developed by Dave Winer of RSS and podcast fame.Events are one popular type of microcontent. OpenEvents is already working to create shared databases of standardized events, which would get used by a new generation of event portalsâsuch as Eventful/EVDB, Upcoming.org, and WhizSpark. The idea of OpenEvents is that event-oriented systems and services can work together to establish shared events databases (and associated APIs) that any developer could then use to create and offer their own new service or application. OpenReviews is still in the conceptual stage, but it would make it possible to provide open alternatives to closed systems like Epinions, and establish a shared database of local and global reviews. Its shared open servers would be filled with all sorts of reviews for anyone to access. Why is this important? Because I predict that in the future, 10 times more people will be writing reviews than maintaining their own blog. The list of possible microcontent standards goes on: OpenJobpostings, OpenRecipes, and even OpenLists. 
Microsoft recently revealed that it has been working on an important new kind of microcontent: Lists—so OpenLists will attempt to establish standards for the kind of lists we all use, such as lists of Links, lists of To Do Items, lists of People, Wish Lists, etc. Movers and Shakers: Tantek Çelik and Kevin Marks of Technorati, Danny Ayers, Eric Meyer, Matt Mullenweg, Rohit Khare, Adam Rifkin, Arnaud Leene, Seb Paquet, Alf Eaton, Phil Pearson, Joe Reger, Bob Wyman among others. 5. Open Social Networks: I&#39;ll never forget the first time I met Jonathan Abrams, the founder of Friendster. He was arrogant and brash and he claimed he &quot;owned&quot; all his users, and that he was going to monetize them and make a fortune off them. This attitude robbed Friendster of its momentum, letting MySpace, Facebook, and other social networks take Friendster&#39;s place. Jonathan&#39;s notion of social networks as a way to control users is typical of the Web 1.0 business model and its attitude towards users in general. Social networks have become one of the battlegrounds between old and new ways of thinking. Open standards for Social Networking will define those sides very clearly. Since meeting Jonathan, I have been working towards finding and establishing open standards for social networks. Instead of closed, centralized social networks with 10 million people in them, the goal is making it possible to have 10 million social networks that each have 10 people in them. FOAF (which stands for Friend Of A Friend, and describes people and relationships in a way that computers can parse) is a schema to represent not only your personal profile&#39;s meta-data, but your social network as well. Thousands of researchers use the FOAF schema in their &quot;Semantic Web&quot; projects to connect people in all sorts of new ways. 
XFN is a microformat standard for representing your social network, while vCard (long familiar to users of contact manager programs like Outlook) is a microformat that contains your profile information. Microformats are baked into any xHTML webpage, which means that any blog, social network page, or any webpage in general can &quot;contain&quot; your social network in it—and be used by any compatible tool, service or application. PeopleAggregator is an earlier project now being integrated into open content management framework Drupal. The PeopleAggregator APIs will make it possible to establish relationships, send messages, create or join groups, and post between different social networks. (Sneak preview: this technology will be available in the upcoming GoingOn Network.) All of these open social networking standards mean that inter-connected social networks will form a mesh that will parallel the blogosphere. This vibrant, distributed, decentralized world will be driven by open standards: personalized online experiences are what the new open web will be all about—and what could be more personalized than people&#39;s networks? Movers and Shakers: Eric Sigler, Joel De Gan, Chris Schmidt, Julian Bond, Paul Martino, Mary Hodder, Drummond Reed, Dan Brickley, Randy Farmer, and Kaliya Hamlin, to name a few. 6. Tags: Nowadays, no self-respecting tool or service can ship without tags. Tags are keywords or phrases attached to photos, blog posts, URLs, or even video clips. These user- and creator-generated tags are an open alternative to what used to be the domain of librarians and information scientists: categorizing information and content using taxonomies. Tags are instead creating &quot;folksonomies.&quot; The recently proposed OpenTags concept would be an open, community-owned version of the popular Technorati Tags service. It would aggregate the usage of tags across a wide range of services, sites, and content tools. 
In addition to Technorati&#39;s current tag features, OpenTags would let groups of people share their tags in &quot;TagClouds.&quot; Open tagging is likely to include some of the open identity features discussed above, to create a tag system that is resilient to spam, and yet trustable across sites all over the web.OpenTags owes a debt to earlier versions of shared tagging systems, which include Topic Exchange and something called the k-collectorÂa knowledge management tag aggregatorÂfrom Italian company eVectors. Movers &amp; Shakers:Phil Pearson, Matt Mower , Paolo Valdemarin, and Mary Hodder and Drummond Reed again, among others.7. PingingWebsites used to be mostly static. Search engines that crawled (or &quot;spidered&quot;) them every so often did a good enough job to show reasonably current versions of your cousin&#39;s homepage or even TimeÂ magazine&#39;s weekly headlines. But when blogging took off, it became hard for search engines to keep up. (Google has only just managed to offer blog-search functionality, despite buying Blogger back in early 2003.)To know what was new in the blogosphere, users couldn&#39;t depend on services that spidered webpages once in a while. The solution: a way for blogs themselves to automatically notify blog-tracking sites that they&#39;d been updated. Weblogs.com was the first blog &quot;ping service&quot;: it displayed the name of a blog whenever that blog was updated. Pinging sites helped the blogosphere grow, and more tools, services, and portals started using pinging in new and different ways. Dozens of pinging services and sitesÂmost of which can&#39;t talk to each otherÂsprang up. Matt Mullenweg (the creator of open source blogging software WordPress) decided that a one-stop service for pinging was needed. He created Ping-o-MaticÂwhich aggregates ping services and simplifies the pinging process for bloggers and tool developers. 
With Ping-o-Matic, any developer can alert all of the industry&#39;s blogging tools and tracking sites at once. This new kind of open standard, with shared infrastructure, is critical to the scalability of Web 2.0 services. As Matt said: &quot;There are a number of services designed specifically for tracking and connecting blogs. However it would be expensive for all the services to crawl all the blogs in the world all the time. By sending a small ping to each service you let them know you&#39;ve updated so they can come check you out. They get the freshest data possible, you don&#39;t get a thousand robots spidering your site all the time. Everybody wins.&quot; Movers and Shakers: Matt Mullenweg, Jim Winstead, Dave Winer. 8. Routing: Bloggers used to have to manually enter the links and content snippets of blog posts or news items they wanted to blog. Today, some RSS aggregators can send a specified post directly into an associated blogging tool: as bloggers browse through the feeds they subscribe to, they can easily specify and send any post they wish to &quot;reblog&quot; from their news aggregator or feed reader into their blogging tool. (This is usually referred to as &quot;BlogThis.&quot;) As structured blogging comes into its own (see the section on Microcontent Publishing), it will be increasingly important to maintain the structural integrity of these pieces of microcontent when reblogging them. Promising standard RedirectThis will combine a &quot;BlogThis&quot;-like capability while maintaining the integrity of the microcontent. RedirectThis will let bloggers and content developers attach a simple &quot;PostThis&quot; button to their posts. Clicking on that button will send that post to the reader/blogger&#39;s favorite blogging tool. This favorite tool is specified at the RedirectThis web service, where users register their blogging tool of choice. 
RedirectThis also helps maintain the integrity and structure of microcontentÂthen it&#39;s just up to the user to prefer a blogging tool that also attains that lofty goal of microcontent integrity. OutputThis is another nascent web services standard, to let bloggers specify what &quot;destinations&quot; they&#39;d like to have as options in their blogging tool. As new destinations are added to the service, more checkboxes would get added to their blogging toolÂallowing them to route their published microcontent to additional destinations.Movers and Shakers:Michael Migurski, Lucas Gonze9. Open CommunicationsLikely, you&#39;ve experienced the joys of finding friends on AIM or Yahoo Messenger, or the convenience of Skyping with someone overseas. Not that you&#39;re about to throw away your mobile phone or BlackBerry, but for many, also having access to Instant Messaging (IM) and Voice over IP (VoIP) is crucial. IM and VoIP are mainstream technologies that already enjoy the benefits of open standards. Entire industries are bornÂright this secondÂbased around these open standards. Jabber has been an open IM technology for yearsÂin fact, as XMPP, it was officially dubbed a standard by the IETF. Although becoming an official IETF standard is usually the kiss of death, Jabber looks like it&#39;ll be around for a while, as entire generations of collaborative, work-group applications and services have been built on top of its messaging protocol. For VoIP, Skype is clearly the leading standard todayÂthough one could argue just how &quot;open&quot; it is (and defenders of the IETF&#39;s SIP standard often do). But it is free and user-friendly, so there won&#39;t be much argument from usersÂ  about it being insufficiently open. Yet there may be a cloud on Skype&#39;s horizon: web behemoth Google recently released a beta of Google Talk, an IM client committed to open standards. 
It currently supports XMPP, and will support SIP for VoIP calls.Movers and Shakers:Jeremie Miller, Henning Schulzrinne, Jon Peterson, Jeff Pulver10. Device Management and ControlTo access online content, we&#39;re using more and more devices. BlackBerrys, iPods, Treos, you name it. As the web evolves, more and more different devices will have to communicate with each other to give us the content we want when and where we want it. No-one wants to be dependent on one vendor anymoreÂlike, say, SonyÂfor their laptop, phone, MP3 player, PDA, and digital camera, so that it all works together. We need fully interoperable devices, and the standards to make that work. And to fully make use of how content is moving online content and innovative web services, those standards need to be open.MIDI (musical instrument digital interface), one of the very first open standards in music, connected disparate vendors&#39; instruments, post-production equipment, and recording devices. But MIDI is limited, and MIDI II has been very slow to arrive. Now a new standard for controlling musical devices has emerged: OSC (Open SoundControl). This protocol is optimized for modern networking technology and inter-connects music, video and controller devices with &quot;other multimedia devices.&quot; OSC is used by a wide range of developers, and is being taken up in the mainstream MIDI marketplace.Another open-standards-based device management technology is ZigBee, for building wireless intelligence and network monitoring into all kinds of devices. ZigBee is supported by many networking, consumer electronics, and mobile device companies.Â  Â  Â  Â· Â· Â· Â· Â· Â· Â  Â  The Change to OpennessThe rise of open source software and its &quot;architecture of participation&quot; are completely shaking up the old proprietary-web-services-and-standards approach. 
Sun MicrosystemsÂwhose proprietary Java standard helped define the Web 1.0Âis opening its Solaris OS and has even announced the apparent paradox of an open-source Digital Rights Management system.Today&#39;s incumbents will have to adapt to the new openness of the Web 2.0. If they stick to their proprietary standards, code, and content, they&#39;ll become the new walled gardensÂplaces users visit briefly to retrieve data and content from enclosed data silos, but not where users &quot;live.&quot; The incumbents&#39; revenue models will have to change. Instead of &quot;owning&quot; their users, users will know they own themselves, and will expect a return on their valuable identity and attention. Instead of being locked into incompatible media formats, users will expect easy access to digital content across many platforms. Yesterday&#39;s web giants and tomorrow&#39;s users will need to find a mutually beneficial new balanceÂbetween open and proprietary, developer and user, hierarchical and horizontal, owned and shared, and compatible and closed. Marc Canter is an active evangelist and developer of open standards. Early in his career, Marc founded MacroMind, which became Macromedia. These days, he is CEO of Broadband Mechanics, a founding member of the Identity Gang and of ourmedia.org. Broadband Mechanics is currently developing the GoingOn Network (with the AlwaysOn Network), as well as an open platform for social networking called the PeopleAggregator.A version of the above post appears in the Fall 2005 issue of AlwaysOn&#39;s quarterly print blogozine, and ran as a four-part series on the AlwaysOn Network website.(Via Marc&#39;s Voice.)</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p><a href="http://marc.blogs.it/">Marc Canter</a>&#39;s <a href="http://marc.blogs.it/archives/2005/10/breaking_the_we.html">Breaking the Web Wide Open! </a> article is something I found pretty late (by my normal discovery standards). This was partly due to the pre- and post- Web 2.0 event noise levels that have dumped the description of an important industry inflection into the &quot;Bozo Bin&quot; of many. Personally, I think we shouldn&#39;t confuse the Web 2.0 traditional-pitch-fest conference with an attempt to identify an important industry inflection.</p><p> Anyway, Marc&#39;s article is a very refreshing read because it provides a really good insight into the general landscape of a rapidly evolving Web alongside genuine appreciation of our broader timeless pursuit of &quot;Openness&quot;. </p><p>To really help this document provide additional value, I have scraped the content of the original post and dumped it below so that we can appreciate the value of the links embedded within the article (note: thanks to Virtuoso I only had to paste the content into my blog; the extraction to my <a href="http://www.openlinksw.com/blog/~kidehen/index.vspx?page=linkblog">Linkblog</a> and <a href="http://www.openlinksw.com/blog/~kidehen/index.vspx?page=summary">Blog Summary</a> Pages is simply a feature of my <a href="http://www.openlinksw.com/virtuos">Virtuoso </a>based Blog Engine):</p><blockquote><h3 class="hed2" style="padding-bottom: 10px">Breaking the Web Wide Open! 
(complete story)</h3><p>Even the web giants like AOL, Google, MSN, and Yahoo need to observe these open standards, or they&#39;ll risk becoming the &quot;walled gardens&quot; of the new web and be coolio no more.</p><p class="byline"><b><a href="http://community.alwayson-network.com/cgi-bin/WebObjects/AlwaysOn.woa/wa/display?id=9254:Person">Marc Canter</a></b> [<a href="http://community.alwayson-network.com/cgi-bin/WebObjects/AlwaysOn.woa/wa/display?id=9254:Person"><b>Broadband Mechanics, Inc.</b></a>] | POSTED: 09.26.05 @12:00</p><table width="100%" border="0" cellspacing="0" cellpadding="0"><tr><td valign="TOP" class="copy1"><img src="http://community.alwayson-network.com/ao/images/thumb/19433429363e7cd6b1ecfb7.jpg" align="LEFT" border="0" width="80" style="margin: 0px 10px 5px 0px" alt="" /><i><b>Editorial Note:</b> Several months ago, AlwaysOn got a personal invitation from Yahoo founder Jerry Yang &quot;to see and give us feedback on our new social media product, y!360.&quot; We were happy to oblige and dutifully showed up, joining a conference room full of hard-core bloggers and new, new media types. The geeks gave Yahoo 360 an overwhelming thumbs down, with comments like, &quot;So the only services I can use within this new network are Yahoo services? What if I don&#39;t use Yahoo IM?&quot; In essence, the Yahoo team was booed for being &quot;closed web,&quot; and we heartily agreed. With Yahoo 360, Yahoo continues building its own &quot;walled garden&quot; to control its 135 million customersÂan accusation also hurled at AOL in the early 1990s, before AOL migrated its private network service onto the web. As the</i>Â  <a href="http://bernardmoon.blogspot.com/2005/08/yahoos-personality-crisis.html" target="_blank">Economist<i> recently noted</i></a>, &quot;Yahoo, in short, has old media plans for the new-media era.&quot;<br /><br />The irony to our view here is, of course, that today&#39;s AO Network is also a &quot;closed web.&quot; In the end, Mr. 
Yang&#39;s thoughtful invitation and our ensuing disappointment in his new service led to the assignment of this article. It also confirmed our existing plan to completely revamp the AO Network around open standards. To tie it all together, we recruited the chief architect of our new site, <a href="http://www.corante.com/amateur/articles/20030211-3564.html" target="_blank">the notorious Marc Canter</a>, to pen this piece. We look forward to our reader feedback.<br /><br /><b>Breaking the Web Wide Open!</b><br />By Marc Canter<br /><br />For decades, &quot;walled gardens&quot; of proprietary standards and content have been the strategy of dominant players in mainframe computer software, wireless telecommunications services, and the World Wide WebÂit was their successful lock-in strategy of keeping their customers theirs. But like it or not, those walls are tumbling down. Open web standards are being adopted so widely, with such value and impact, that the web giantsÂAmazon, AOL, eBay, Google, Microsoft, and YahooÂare facing the difficult decision of opening up to what they don&#39;t control.<br /><br />The online world is evolving into a new open web (sometimes called the Web 2.0), which is all about being personalized and customized for each user. Not only open source software, but <i>open standards</i>Â are becoming an essential component. <br /><br />Many of the web giants have been using open source software for years. Most of them use at least parts of the <a href="http://www.onlamp.com/pub/a/onlamp/2001/01/25/lamp.html" target="_blank">LAMP</a> (Linux, Apache, MySQL, Perl/Python/PHP) stack, even if they aren&#39;t well-known for giving back to the open source community. For these incumbents that grew big on proprietary web services, the methods, practices, and applications of open source software development are difficult to fully adopt. 
And the next open source movements—which will be as much about open standards as about code—will be a lot harder for the incumbents to exploit.<br /><br />While the incumbents use cheap open source software to run their back-end systems, their business models largely depend on proprietary software and algorithms. But in our view, a new slew of open software, open protocols, and open standards will confront the incumbents with the classic <i><a href="http://www.businessweek.com/chapter/christensen.htm" target="_blank">Innovator&#39;s Dilemma</a></i>. Should they adopt these tools and standards, painfully cannibalizing their existing revenue for a new unproven concept, or should they stick with their currently lucrative model with the risk that eventually a bunch of upstarts eat their lunch? <br /><br />Credit should go to several of the web giants who have been making efforts to &quot;open up.&quot; Google, Yahoo, eBay, and Amazon all have Open APIs (Application Programming Interfaces) built into their data and systems. Any software developer can access and use them for whatever creative purposes they wish. This means that the API provider becomes an open platform for everyone to use and build on top of. This notion has expanded like wildfire throughout the blogosphere, so nowadays, Open APIs are pretty much required.<br /><br />Other incumbents also have open strategies. AOL has got the RSS religion, <a href="http://www.siliconbeat.com/entries/2005/07/27/aol_gets_rss_religion_with_my_aoland_feedsters_help.html" target="_blank">providing a feedreader and RSS search</a> in order to escape the &quot;walled garden of content&quot; stigma. <a href="http://www.apple.com/podcasting/" target="_blank">Apple now incorporates podcasts</a>, the &quot;personal radio shows&quot; that are the latest rage in audio narrowcasting, into iTunes. 
Even Microsoft is supporting open standards, for example <a href="http://www.microsoft.com/technet/prodtechnol/winxppro/plan/rtcprot.mspx#EKAA" target="_blank">by endorsing SIP (Session Initiation Protocol) for internet telephony and conferencing</a> over Skype&#39;s proprietary format or one of its own devising.<br /><br />But new open standards and protocols are in use, under construction, or being proposed every day, pushing the envelope of where we are right now. Many of these standards are coming from startup companies and small groups of developers, not from the giants. Together with the Open APIs, those new standards will contribute to a new, open infrastructure. Tens of thousands of developers will use and improve this open infrastructure to create new kinds of web-based applications and services, to offer web users a highly personalized online experience.<br /><br /><b>A Brief History of Openness</b><br /><br />At this point, I have to admit that I am not just a passive observer, full-time journalist or &quot;just some blogger&quot;Âbut an active evangelist and developer of these standards. It&#39;s the vision of &quot;open infrastructure&quot; that&#39;s driving <a href="http://www.broadbandmechanics.com/bbm2005.htm" target="_blank">my company </a> and the reason why I&#39;m writing this article. This article will give you some of the background behind on these standards, and what the evolution of the next generation of open standards will look like.<br /><br />Starting back in the 1980s, establishing a software standard was a key strategy for any software company. My former company, MacroMind (which became Macromedia), achieved this goal early on with Director. 
As <a href="http://webmonkey.wired.com/webmonkey/99/27/index3a_page6.html?tw=multimedia" target="_blank">Director evolved into Flash</a>, the world saw that other companies besides Microsoft, Adobe, and Apple could establish true cross-platform, independent media standards.<br /><br />Then <a href="http://www.w3.org/People/Berners-Lee/" target="_blank">Tim Berners-Lee</a> and <a href="http://www.ibiblio.org/pioneers/andreesen.html" target="_blank">Marc Andreessen</a> came along, and changed the rules of the software business and of entrepreneurialism. No matter how entrenched and &quot;standardized&quot; software was, the rug could still get pulled out from under it. <a href="http://geekphilosopher.com/MainPage/WebBrowserWars.htm?q=Stocks" target="_blank">Netscape did it to Microsoft, and then Microsoft did it <i>back</i>Â  to Netscape</a>. The web evolved, and lots of standards evolved with it. The leading open source standards (such as the LAMP stack) became widely used alternatives to proprietary closed-source offerings. <br /><br />Open standards are more than just technology. Open standards mean sharing, empowering, and community support. Someone floats a new idea (or <a href="http://en.wikipedia.org/wiki/Meme" target="_blank">meme</a>) and the community runs with it â with each person making their own contributions to the standard â evolving it without a moment&#39;s hesitation about &quot;giving away their intellectual property.&quot;<br /><br />One good example of this was <a href="http://www.sifry.com/alerts/" target="_blank">Dave Sifry</a>, who built the Technorati blog-tracking technology inspired by the <a href="http://www.myelin.co.nz/ecosystem/" target="_blank">Blogging Ecosystem</a>, a weekend project by young hacker <a href="http://marc.blogs.it/archives/2005/07/phil_pearson_jo.html" target="_blank">Phil Pearson</a>. 
Dave liked what he saw and he ran with it&#8212;turning Technorati into what it is today.<br /><br /><a href="http://en.wikipedia.org/wiki/Dave_Winer" target="_blank">Dave Winer</a> has contributed enormously to this area of open standards. He defined and personally created several open standards and protocols&#8212;such as RSS, OPML, and XML-RPC. Dave has also <a href="http://newhome.weblogs.com/historyOfWeblogs" target="_blank">helped build</a> the blogosphere through his enthusiasm and passion.<br /><br />By 2003, hundreds of programmers were working on creating and establishing new standards for almost everything. The best of these new standards have evolved into compelling web services platforms &#8212; such as <a href="http://del.icio.us/" target="_blank">del.icio.us</a>, <a href="http://webjay.org/about" target="_blank">Webjay</a>, or <a href="http://www.flickr.com/photos/tags/ao2005/" target="_blank">Flickr</a>. Some have even spun off formal standards &#8212; like XSPF (a standard for playlists) or instant messaging standard XMPP (also known as Jabber).<br /><br />Today&#39;s Open APIs are complemented by standardized Schemas&#8212;the structure of the data itself and its associated meta-data. Take for example a <a href="http://www.ipodder.org/whatIsPodcasting" target="_blank">podcasting feed</a>. It consists of: a) the radio show itself, b) information on who is on the show, what the show is about and how long the show is (the meta-data) and also c) API calls to retrieve a show (a single feed item) and play it from a specified server. <br /><br />The combination of Open APIs, standardized schemas for handling meta-data, and an industry which agrees on these standards is breaking the web wide open right now. So what new open standards should the web incumbents&#8212;and you&#8212;be watching? 
Keep an eye on the following developments:<br /><br /><b>Identity<br />Attention<br />Open Media<br />Microcontent Publishing<br />Open Social Networks<br />Tags<br />Pinging <br />Routing<br />Open Communications<br />Device Management and Control</b><br /><br /><br /><b>1.	Identity</b><br /><br />Right now, you don&#39;t really control your own online identity. At the core of just about every online piece of software is a membership system. Some systems allow you to browse a site anonymously&#8212;but unless you register with the site you can&#39;t do things like search for an article, post a comment, buy something, or review it. The problem is that each and every site has its own membership system. So you constantly have to register with new systems, which cannot share data&#8212;even if you&#39;d want them to. By establishing a <a href="http://www.wired.com/news/privacy/0,1848,68329-2,00.html?tw=wn_story_page_next1" target="_blank">&quot;single sign-on&quot; standard</a>, disparate sites can allow users to freely move from site to site, and let them control the movement of their personal profile data, as well as any other data they&#39;ve created. <br /><br />With <a href="http://www.thehindubusinessline.com/2005/01/03/stories/2005010301440200.htm" target="_blank">Passport, Microsoft unsuccessfully attempted</a> to force its proprietary standard on the industry. Instead, a world is evolving where most people assume that users want to control their own data, whether that data is their profile, their blog posts and photos, or some collection of their past interactions, purchases, and recommendations. As long as users can control their digital identity, any kind of service or interaction can be layered on top of it.<br /><br /><a href="http://www.identity20.com/media/OSCON2005/" target="_blank">Identity 2.0</a> is all about users controlling their own profile data and becoming their own agents. 
This way the users themselves, rather than other intermediaries, will profit from their ID info. Once developers start offering single sign-on to their users, and users have trusted places to store their dataÂwhich respect the limits and provide access controls over that data, users will be able to access personalized services which will understand and use their personal data.<br /><br />Identity 2.0 may seem like some geeky, visionary future standard that isn&#39;t defined yet, but by putting each user&#39;s digital identity at the core of all their online experiences, Identity 2.0 is becoming the cornerstone of the new open web. <br /><br /><b>The Initiatives:</b><br />Right now, Identity 2.0 is under construction through various efforts from Microsoft (the <a href="http://msdn.microsoft.com/webservices/webservices/understanding/advancedwebservices/default.aspx?pull=/library/en-us/dnwebsrv/html/identitymetasystem.asp" target="_blank">&quot;InfoCard&quot; component built into the Vista operating system</a> and its &quot;<a href="http://garage.docsearls.com/node/605" target="_blank">Identity Metasystem</a>&quot;), <a href="http://sxip.com" target="_blank">Sxip Identity</a>, <a href="http://www.identtycommons.net" target="_blank">Identity Commons</a>, <a href="http://www.projectliberty.org/" target="_blank">Liberty Alliance</a>, <a href="http://lid.netmesh.org/" target="_blank">LID</a> (NetMesh&#39;s Lightweight ID), and SixApart&#39;s <a href="http://openid.net/" target="_blank">OpenID</a>.<br /><br /><b>More Movers and Shakers:</b><br />Identity Commons and <a href="http://www.identitywoman.net" target="_blank">Kaliya Hamlin</a>, Sxip Identity and <a href="http://blame.ca/dick/" target="_blank">Dick Hardt</a>, the <a href="http://www.identitygang.org/" target="_blank"> Identity Gang</a> and <a href="http://www.searls.com/dochome.html#Bio" target="_blank">Doc Searls</a>, Microsoft&#39;s <a href="http://www.identityblog.com/" target="_blank">Kim Cameron</a>, <a 
href="http://www.craigburton.com/" target="_blank">Craig Burton</a>, <a href="http://phil.windley.org/" target="_blank">Phil Windley</a>, and <a href="http://slashdot.org/article.pl?sid=05/07/05/2020221&from=rss" target="_blank">Brad Fitzpatrick</a>, to name a few.<br /><br /><br /><b>2.	Attention</b><br /><br />How many readers know what their online attention is worth? If you don&#39;t, Google and Yahoo doÂthey make their living off our attention. They know what we&#39;re searching for, happily turn it into a keyword, and sell that keyword to advertisers. They make money off our attention. We don&#39;t. <br /><br />Technorati and friends proposed <a href="http://blogs.zdnet.com/Gillmor/index.php?p=74" target="_blank">an attention standard, Attention.xml</a>, designed to &quot;help you keep track of what you&#39;ve read, what you&#39;re spending time on, and what you should be paying attention to.&quot; <a href="http://attentiontrust.org/" target="_blank">AttentionTrust</a> is an effort by <a href="http://blogs.zdnet.com/Gillmor/?p=132" target="_blank">Steve Gillmor</a> and <a href="http://majestic.typepad.com/seth/2005/07/attentiontrusto.html" target="_blank">Seth Goldstein </a>to standardize on how captured end-user performance, browsing, and interest data are used. <br /><br />Blogger <a href="http://worcester.typepad.com/pc4media/2005/07/attentiontrusto_1.html" target="_blank">Peter Caputa gives a good summary</a> of AttentionTrust: <blockquote>&quot;As we use the web, we reveal lots of information about ourselves by what we pay attention to. Imagine if all of that information could be stored in a nice neat little xml file. And when we travel around the web, we can optionally share it with websites or other people. We can make them pay for it, lease it ... we get to decide who has access to it, how long they have access to it, and what we want in return. 
And they have to tell us what they are going to do with our Attention data.&quot;</blockquote><br />So when you give your attention to sites that adhere to the AttentionTrust, your attention rights (<i>you own your attention, you can move your attention, you can pay attention and be paid for it</i>,Â  and <i>you can see how your attention is used</i>) are guaranteed. Attention data is crucial to the future of the open web, and Steve and Seth are making sure that no one entity or oligopoly controls it. <br /><br /><b>Movers and Shakers:</b><br /><a href="http://blogs.zdnet.com/Gillmor/" target="_blank">Steve Gillmor</a>, <a href="http://majestic.typepad.com/about.html" target="_blank">Seth Goldstein</a>, <a href="http://www.sifry.com/alerts/" target="_blank">Dave Sifry</a> and the <a href="http://developers.technorati.com/wiki/attentionxml" target="_blank">other Attention.xml folks</a>. <br /><br /><br /><b>3.	Open Media</b><br /><br />Proprietary media standardsÂFlash, Windows Media, and QuickTime, to name a few Âhelped liven up the web. But they are proprietary standards that try to keep us locked in, and they weren&#39;t created from scratch to handle today&#39;s online content. That&#39;s why, for many of us, an Open Media standard has been a holy grail. Yahoo&#39;s new Media RSS standard brings us one step closer to achieving open media, as do <a href="http://www.vorbis.com/faq/#what" target="_blank">Ogg Vorbis</a> audio codecs, <a href="http://webjay.org/" target="_blank">XSPF playlists</a>, or <a href="http://musicbrainz.org/" target="_blank">MusicBrainz</a>. And several sites offer digital creators not only a place to store their content, but also to sell it. 
<br /><br /><a href="http://search.yahoo.com/mrss" target="_blank">Media RSS </a>(being developed by Yahoo with help from the community) extends RSS and combines it with &quot;RSS enclosures&quot; Âadds metadata to any media itemÂto create a comprehensive solution for media &quot;narrowcasters.&quot; To gain acceptance for Media RSS, Yahoo knows it has to work with the community. As an active member of this community, I can tell you that we&#39;ll create Media RSS equivalents for <a href="http://www.xml.com/pub/a/2001/01/24/rdf.html" target="_blank">rdf</a> (an alternative subscription format) and <a href="http://www.atomenabled.org/" target="_blank">Atom</a> (yet <i>another</i>Â  subscription format), so no one will be able to complain that Yahoo is picking sides in format wars.<br /><br />When Yahoo announced the purchase of Flickr, Yahoo founder Jerry Yang insinuated that Yahoo is acquiring &quot;open DNA&quot; to turn Yahoo into <a href="http://www.flickr.com/services/api/" target="_blank">an open standards player</a>. Yahoo is showing what happens when you take a multi-billion dollar company and make openness one of its core valuesÂso Google, beware, even if Google does have more research fellows and Ph.D.s. <br /><br />The open media landscape is far and wide, reaching from game machine hacks and mobile phone downloads to PC-driven bookmarklets, players, and editors, and it includes many other standardization efforts. <a href="http://www.xspf.org/" target="_blank">XSPF</a> is an open standard for playlists, and MusicBrainz is an alternative to the proprietary (and originally effectively stolen) database that <a href="http://en.wikipedia.org/wiki/Gracenote" target="_blank">Gracenote</a> licenses. <br /><br /><a href="http://www.ourmedia.org/" target="_blank">Ourmedia.org</a> is a community front-end to Brewster Kahle&#39;s <a href="http://www.archive.org" target="_blank">Internet Archive</a>. 
Brewster has promised free bandwidth and free storage forever to any content creators who choose to share their content via the Internet Archive. Ourmedia.org is providing an easy-to-use interface and community to get content in and out of the Internet Archive, giving ourmedia.org users the ability to share their media anywhere they wish, without being locked into a particular service or tool. Ourmedia plans to offer open APIs and an open media registry that interconnects other open media repositories into a DNS-like registry (just like the www domain system), so folks can browse and discover open content across many open media services. Systems like <a href="http://www.brightcove.com/" target="_blank">Brightcove</a> and <a href="http://www.evhead.com/2005/02/how-odeo-happened.asp" target="_blank">Odeo</a> support the concept of an open registry, and hope to work with digital creators to sell their work to fulfill the financial aspect of <a href="http://en.wikipedia.org/wiki/The_Long_Tail" target="_blank">the &quot;Long Tail.&quot;</a><br /><br /><b>More Movers and Shakers:</b><br /><a href="http://creativecommons.org/about/people" target="_blank">Creative Commons</a>, the <a href="http://www.omn.org/" target="_blank">Open Media Network</a>, <a href="http://www.momentshowing.net/about.html" target="_blank">Jay Dedman</a>, <a href="http://ryanedit.blogspot.com/" target="_blank">Ryanne Hodson</a>, <a href="http://michaelverdi.com/index.php" target="_blank">Michael Verdi</a>, <a href="http://www.chapmanlogic.com/blog/aboutEli.html" target="_blank">Eli Chapman</a>, <a href="http://www.unmediated.org/" target="_blank">Kenyatta Cheese</a>, <a href="http://www.itconversations.com/about.html" target="_blank">Doug Kaye</a>, <a href="http://www.wired.com/wired/archive/13.09/yahoo.html" target="_blank">Brad Horowitz</a>, <a href="http://webjay.org/about#colophon" target="_blank">Lucas Gonze</a>, <a href="http://musicbrainz.org/wd/MusicBrainzBio" target="_blank">Robert 
Kaye</a>,  <a href="http://www.lifewithalacrity.com/" target="_blank">Christopher Allen</a>, <a href="http://en.wikipedia.org/wiki/Brewster_Kahle" target="_blank">Brewster Kahle</a>, <a href="http://www.newmediamusings.com/" target="_blank">JD Lasica</a>, and indeed, <a href="http://www.corante.com/amateur/articles/20030211-3564.html" target="_blank">Marc Canter</a>, among others.<br /><br /><br /><b>4.	Microcontent Publishing</b><br /><br />Unstructured content is cheap to create, but hard to search through. Structured content is expensive to create, but easy to search. <a href="http://developers.technorati.com/wiki/MicroFormats" target="_blank">Microformats</a> resolve the dilemma with simple structures that are cheap to use and easy to search.<br /><br />The first kind of widely adopted microcontent is blogging. Every post is an encapsulated idea, addressable via a URL called a permalink. You can syndicate or subscribe to this microcontent using RSS or an RSS equivalent, and news or blog aggregators can then display these feeds in a convenient readable fashion. But a blog post is just a block of unstructured textânot a bad thing, but just a first step for microcontent. When it comes to<i>structured</i>Â data, such as personal identity profiles, product reviews, or calendar-type event data, RSS was not designed to maintain the integrity of the structures. <br /><br />Right now, blogging doesn&#39;t have the underlying structure necessary for full-fledged microcontent publishing. But that will change. Think of local information services (such as movie listings, event guides, or restaurant reviews) that any college kid can access and use in her weekend programming project to create new services and tools.<br /><br />Today&#39;s blogging tools will evolve into microcontent publishing systems, and will help spread the notion of structured data across the blogosphere. 
New ways to store, represent and produce microcontent will create new standards, such as <a href="http://structuredblogging.org/" target="_blank">Structured Blogging</a> and <a href="http://microformats.org/" target="_blank">Microformats</a>. Microformats differ from RSS feeds in that you can&#39;t subscribe to them. Instead, Microformats are embedded into webpages and discovered by search engines like Google or Technorati. Microformats are creating common definitions for &quot;What is a review or event? What are the specific fields in the data structure?&quot; They can also specify what we can do with all this information.<a href="http://www.opml.org/spec" target="_blank">OPML (Outline Processor Markup Language)</a> is a hierarchical file format for storing microcontent and structured data. It was developed by <a href="http://en.wikipedia.org/wiki/Dave_Winer" target="_blank">Dave Winer</a> of RSS and podcast fame.<br /><br />Events are one popular type of microcontent. <a href="http://www.openevents.com" target="_blank">OpenEvents</a> is already working to create shared databases of standardized events, which would get used by a new generation of event portalsâsuch as <a href="http://eventful.com/gotevents/" target="_blank">Eventful/EVDB</a>, <a href="http://upcoming.org/" target="_blank">Upcoming.org</a>, and <a href="http://www.whizspark.com/" target="_blank">WhizSpark</a>. The idea of OpenEvents is that event-oriented systems and services can work together to establish shared events databases (and associated APIs) that any developer could then use to create and offer their own new service or application. <a href="http://marc.blogs.it/archives/2005/04/rvw_redux_openr.html" target="_blank">OpenReviews</a> is still in the conceptual stage, but it would make it possible to provide open alternatives to closed systems like Epinions, and establish a shared database of local and global reviews. 
Its shared open servers would be filled with all sorts of reviews for anyone to access. <br /><br />Why is this important? Because I predict that in the future, 10 times more people will be writing reviews than maintaining their own blog. The list of possible microcontent standards goes on: OpenJobpostings, OpenRecipes, and even OpenLists. Microsoft <a href="http://www.reallysimplesyndication.com/2005/06/22" target="_blank">recently revealed</a> that it has been working on an important new kind of microcontent: Lists&#8212;so OpenLists will attempt to establish standards for the <i>kind</i> of lists we all use, such as lists of Links, lists of To Do Items, lists of People, Wish Lists, etc.<br /><br /><b>Movers and Shakers:</b><br /><a href="http://tantek.com/log/2005/09.html" target="_blank">Tantek &#199;elik</a> and <a href="http://en.wikipedia.org/wiki/Kevin_Marks" target="_blank">Kevin Marks</a> of <a href="http://developers.technorati.com/wiki/MicroFormats" target="_blank">Technorati</a>, <a href="http://dannyayers.com/" target="_blank">Danny Ayers</a>, <a href="http://www.meyerweb.com/" target="_blank">Eric Meyer</a>, <a href="http://photomatt.net/" target="_blank">Matt Mullenweg</a>, <a href="http://zlab.commerce.net/" target="_blank">Rohit Khare</a>, <a href="http://ifindkarma.typepad.com/relax/" target="_blank">Adam Rifkin</a>, <a href="http://www.sivas.com/aleene/" target="_blank">Arnaud Leene</a>, <a href="http://radio.weblogs.com/0110772/" target="_blank">Seb Paquet</a>, <a href="http://hublog.hubmed.org/" target="_blank">Alf Eaton</a>, <a href="http://www.myelin.co.nz/post/" target="_blank">Phil Pearson</a>, <a href="http://www.joereger.com/" target="_blank">Joe Reger</a>, <a href="http://bobwyman.pubsub.com/" target="_blank">Bob Wyman</a> among others.<br /><br /><br /><b>5.	Open Social Networks</b><br /><br />I&#39;ll never forget the first time I met <a href="http://www.jabrams.com/" target="_blank">Jonathan Abrams</a>, the founder of Friendster. 
He was arrogant and brash and he claimed he &quot;<i>owned</i>&quot;Â  all his users, and that he was going to monetize them and make a fortune off them. This attitude robbed Friendster of its momentum, letting MySpace, Facebook, and other social networks take Friendster&#39;s place.<br /><br />Jonathan&#39;s notion of social networks as a way to control users is typical of the Web 1.0 business model and its attitude towards users in general. Social networks have become one of the battlegrounds between old and new ways of thinking. Open standards for Social Networking will define those sides very clearly. Since meeting Jonathan, I have been working towards finding and establishing open standards for social networks. Instead of closed, centralized social networks with 10 million people in them, the goal is making it possible to have 10 million social networks that each have 10 people in them.<br /><br />FOAF (which stands for Friend Of A Friend, and describes people and relationships in a way that computers can parse) is a schema to represent not only your personal profile&#39;s meta-data, but your social network as well. Thousands of researchers use the <a href="http://www.foaf-project.org/" target="_blank">FOAF schema</a> in their &quot;Semantic Web&quot; projects to connect people in all sorts of new ways. <a href="http://gmpg.org/xfn/" target="_blank">XFN</a> is a microformat standard for representing your social network, while <a href="http://www.imc.org/pdi/" target="_blank">vCard</a> (long familiar to users of contact manager programs like Outlook) is a microformat that contains your profile information. Microformats are baked into any xHTML webpage, which means that<i>any</i>Â blog, social network page, or any webpage in general can &quot;contain&quot; your social network in itÂand be used by<i>any</i>Â compatible tool, service or application. 
<br /><br />PeopleAggregator is an earlier project now being integrated into <a href="http://drupal.org/" target="_blank">open content management framework Drupal</a>. The <a href="http://www.broadbandmechanics.com/PeopleAggregator/" target="_blank">PeopleAggregator APIs</a> will make it possible to establish relationships, send messages, create or join groups, and post between different social networks. (Sneak preview: this technology will be available in the upcoming GoingOn Network.) <br /><br />All of these open social networking standards mean that inter-connected social networks will form a mesh that will parallel the blogosphere. This vibrant, distributed, decentralized world will be driven by open standards: personalized online experiences are what the new open web will be all aboutÂand what could be more personalized than people&#39;s networks?<br /><br /><b>Movers and Shakers:</b><br /><a href="http://esigler.2nw.net/" target="_blank">Eric Sigler</a>, <a href="http://lucifer.intercosmos.net/index.php?view=about" target="_blank">Joel De Gan</a>, <a href="http://crschmidt.net/" target="_blank">Chris Schmidt</a>, <a href="http://voidstar.com/" target="_blank">Julian Bond</a>, <a href="http://people.tribe.net/paul?_click_path=Application%5Btribe%5D.Person%5Bf2232c95-e123-43a3-b48d-24a5f11f09dc%5D&r=10535" target="_blank">Paul Martino</a>, <a href="http://napsterization.org/stories/archives/000513.html" target="_blank">Mary Hodder</a>, <a href="http://public.2idi.com/=Drummond.Reed" target="_blank">Drummond Reed</a>, <a href="http://danbri.org/" target="_blank">Dan Brickley</a>, <a href="http://360.yahoo.com/profile-9lciejI3aafX1stHPoIRNmkmv4EowQ--" target="_blank">Randy Farmer</a>, and <a href="http://www.kaliyasblogs.net/Iwoman/" target="_blank">Kaliya Hamlin</a>, to name a few.<br /><br /><br /><b>6.	
Tags</b><br /><br />Nowadays, no self-respecting tool or service can ship without <a href="http://www.salon.com/tech/feature/2005/02/08/tagging/index_np.html" target="_blank">tags</a>. Tags are keywords or phrases attached to photos, blog posts, URLs, or even video clips. These user- and creator-generated tags are an open alternative to what used to be the domain of librarians and information scientists: categorizing information and content using taxonomies. Tags are instead creating <a href="http://www.wired.com/wired/archive/13.04/view.html?pg=4" target="_blank">&quot;folksonomies.&quot;</a><br /><br />The recently proposed OpenTags concept would be an open, community-owned version of the popular <a href="http://www.technorati.com/tag/" target="_blank">Technorati Tags service</a>. It would aggregate the usage of tags across a wide range of services, sites, and content tools. In addition to Technorati&#39;s current tag features, OpenTags would let groups of people share their tags in &quot;<a href="http://www.zeldman.com/daily/0405d.shtml/" target="_blank">TagClouds</a>.&quot; Open tagging is likely to include some of the open identity features discussed above, to create a tag system that is resilient to spam, and yet trustable across sites all over the web.<br /><br />OpenTags owes a debt to earlier versions of shared tagging systems, which include <a href="http://www.topicexchange.com/" target="_blank">Topic Exchange</a> and something called the <a href="http://www.evectors.com/itkcollector/" target="_blank">k-collector</a>&#8212;a knowledge management tag aggregator&#8212;from Italian company eVectors. 
<br /><br /><b>Movers &amp; Shakers:</b><br /><a href="http://www.myelin.co.nz/notes/" target="_blank">Phil Pearson</a>, <a href="http://matt.blogs.it/" target="_blank">Matt Mower </a>, <a href="http://paolo.evectors.it/" target="_blank">Paolo Valdemarin</a>, and <a href="http://marc.blogs.it/archives/2005/03/opentopics.html" target="_blank">Mary Hodder</a> and <a href="http://www.equalsdrummond.name/index.php?p=39" target="_blank"> Drummond Reed</a> again, among others.<br /><br /><br /><b>7. Pinging</b><br /><br />Websites used to be mostly static. Search engines that <a href="http://en.wikipedia.org/wiki/Web_crawler" target="_blank">crawled</a> (or &quot;spidered&quot;) them every so often did a good enough job to show reasonably current versions of your cousin&#39;s homepage or even <i>Time</i>Â magazine&#39;s weekly headlines. But when blogging took off, it became hard for search engines to keep up. (Google has only <a href="http://searchenginewatch.com/searchday/article.php/3548411" target="_blank">just managed</a> to offer <a href="http://www.google.com/help/about_blogsearch.html" target="_blank">blog-search functionality</a>, despite <a href="http://www.alwayson-network.com/comments.php?id=325_0_2_0_C" target="_blank">buying Blogger</a> back in early 2003.)<br /><br />To know what was new in the blogosphere, users couldn&#39;t depend on services that spidered webpages once in a while. The solution: a way for blogs themselves to automatically notify blog-tracking sites that they&#39;d been updated. <a href="http://weblogs.com/" target="_blank">Weblogs.com</a> was the first blog &quot;ping service&quot;: it displayed the name of a blog whenever that blog was updated. Pinging sites helped the blogosphere grow, and <a href="http://blo.gs/" target="_blank">more tools</a>, services, and portals started using pinging in new and different ways. Dozens of pinging services and sitesÂmost of which can&#39;t talk to each otherÂsprang up. 
<br /><br />Matt Mullenweg (the creator of open source blogging software WordPress) decided that a one-stop service for pinging was needed. He created <a href="http://pingomatic.com/" target="_blank">Ping-o-Matic</a>&#8212;which aggregates ping services and simplifies the pinging process for bloggers and tool developers. With Ping-o-Matic, any developer can alert all of the industry&#39;s blogging tools and tracking sites at once. This new kind of open standard, with shared infrastructure, is critical to the scalability of Web 2.0 services.<br /><br />As <a href="http://pingomatic.com/about/" target="_blank">Matt said</a>:<br /><blockquote>There are a number of services designed specifically for tracking and connecting blogs. However it would be expensive for all the services to crawl all the blogs in the world all the time. By sending a small ping to each service you let them know you&#39;ve updated so they can come check you out. They get the freshest data possible, you don&#39;t get a thousand robots spidering your site all the time. Everybody wins.</blockquote><br /><b>Movers and Shakers:</b><br /><a href="http://photomatt.net/about/" target="_blank">Matt Mullenweg</a>, <a href="http://trainedmonkey.com/entry/2251" target="_blank">Jim Winstead</a>, <a href="http://newhome.weblogs.com/faq" target="_blank">Dave Winer</a><br /><br /><br /><b>8. Routing</b><br /><br />Bloggers used to have to manually enter the links and content snippets of blog posts or news items they wanted to blog. Today, some RSS aggregators can send a specified post directly into an associated blogging tool: as bloggers browse through the feeds they subscribe to, they can easily specify and send any post they wish to &quot;<a href="http://www.microsoftmonitor.com/archives/010209.html" target="_blank">reblog</a>&quot; from their news aggregator or feed reader into their blogging tool. 
(This is usually referred to as &quot;<a href="http://help.blogger.com/bin/answer.py?answer=152&topic=17" target="_blank">BlogThis</a>.&quot;) As structured blogging comes into its own (see the section on Microcontent Publishing), it will be increasingly important to maintain the structural integrity of these pieces of microcontent when reblogging them. <br /><br />Promising standard <a href="http://redirectthis.com/" target="_blank">RedirectThis</a> will combine a &quot;BlogThis&quot;-like capability while maintaining the integrity of the microcontent. RedirectThis will let bloggers and content developers attach a simple &quot;PostThis&quot; button to their posts. Clicking on that button will send that post to the reader/blogger&#39;s favorite <a href="http://ecto.kung-foo.tv/archives/000990.php" target="_blank">blogging tool</a>. This favorite tool is specified at the RedirectThis web service, where users register their blogging tool of choice. RedirectThis also helps maintain the integrity and structure of microcontentÂthen it&#39;s just up to the user to prefer a blogging tool that also attains that lofty goal of microcontent integrity. <br /><br />OutputThis is another nascent web services standard, to let bloggers specify what &quot;destinations&quot; they&#39;d like to have as options in their blogging tool. As new destinations are added to the service, more checkboxes would get added to their blogging toolÂallowing them to route their published microcontent to additional destinations.<br /><br /><b>Movers and Shakers:</b><br /><a href="http://reblog.org/" target="_blank">Michael Migurski</a>, <a href="http://www.gonze.com/about" target="_blank">Lucas Gonze</a><br /><br /><br /><b>9. Open Communications</b><br /><br />Likely, you&#39;ve experienced the joys of finding friends on AIM or Yahoo Messenger, or the convenience of Skyping with someone overseas. 
Not that you&#39;re about to throw away your mobile phone or BlackBerry, but for many, also having access to Instant Messaging (IM) and Voice over IP (VoIP) is crucial. <br /><br />IM and VoIP are mainstream technologies that already enjoy the benefits of open standards. Entire industries are bornÂright this secondÂbased around these open standards. <a href="http://www.jabber.org/" target="_blank">Jabber</a> has been an open IM technology for yearsÂin fact, <a href="http://www.xmpp.org/history.html" target="_blank">as XMPP</a>, it was officially dubbed a standard by <a href="http://www.ietf.org/overview.html" target="_blank">the IETF</a>. Although becoming an <a href="http://en.wikipedia.org/wiki/IETF" target="_blank">official IETF standard</a> is usually the kiss of death, Jabber looks like it&#39;ll be around for a while, as entire generations of collaborative, work-group applications and services have been built on top of its messaging protocol. For VoIP, <a href="http://skype.com/helloagain.html" target="_blank">Skype</a> is clearly the leading standard todayÂthough one could <a href="http://socialsoftware.weblogsinc.com/entry/1234000923058521/" target="_blank">argue just how &quot;open&quot; it is</a> (and defenders of the IETF&#39;s <a href="http://www.cs.columbia.edu/sip/" target="_blank">SIP standard</a> often do). But it is free and user-friendly, so there won&#39;t be much argument from <i>users</i>Â  about it being insufficiently open. Yet there may be a cloud on Skype&#39;s horizon: web behemoth Google recently released a beta of <a href="http://www.google.com/talk/developer.html" target="_blank">Google Talk, an IM client committed to open standards</a>. 
It currently <a href="http://radar.oreilly.com/archives/2005/08/google_talk_rel.html" target="_blank">supports XMPP, and will support SIP</a> for VoIP calls.<br /><br /><b>Movers and Shakers:</b><br /><a href="http://www.jabber.org/people/jer.shtml" target="_blank">Jeremie Miller</a>, <a href="http://www.cs.columbia.edu/~hgs/" target="_blank">Henning Schulzrinne</a>, <a href="http://www.von.com/schedule_eos11114704148.html" target="_blank">Jon Peterson</a>, <a href="http://www.pulver.com/jeff/" target="_blank">Jeff Pulver</a><br /><br /><br /><b>10. Device Management and Control</b><br /><br />To access online content, we&#39;re using more and more devices. BlackBerrys, iPods, Treos, you name it. As the web evolves, more and more different devices will have to communicate with each other to give us the content we want when and where we want it. No-one wants to be dependent on one vendor anymore&#8212;like, <a href="http://www.alwayson-network.com/comments.php?id=P9409_0_6_0_C" target="_blank">say, Sony</a>&#8212;for their laptop, phone, MP3 player, PDA, and digital camera, so that it all works together. We need fully interoperable devices, and the standards to make that work. And to make full use of the way content is moving online, and of innovative web services, those standards need to be open.<br /><br /><a href="http://en.wikipedia.org/wiki/Midi" target="_blank">MIDI (musical instrument digital interface)</a>, one of the very first open standards in music, connected disparate vendors&#39; instruments, post-production equipment, and recording devices. But MIDI is limited, and <a href="http://www.oreillynet.com/pub/wlg/8015" target="_blank">MIDI II has been very slow to arrive</a>. Now a new standard for controlling musical devices has emerged: <a href="http://www.cnmat.berkeley.edu/OpenSoundControl/" target="_blank">OSC (Open SoundControl)</a>. 
This protocol is optimized for modern networking technology and inter-connects music, video and controller devices with &quot;other multimedia devices.&quot; OSC is used by a wide range of developers, and is being taken up in the mainstream MIDI marketplace.<br /><br />Another open-standards-based device management technology is <a href="http://www.zigbee.org" target="_blank">ZigBee</a>, for building wireless intelligence and network monitoring into all kinds of devices. ZigBee is supported by many networking, consumer electronics, and mobile device companies.<br /><br /><br />&nbsp; &nbsp; &nbsp; · · · · · · &nbsp; &nbsp; <br /><br /><b>The Change to Openness</b><br /><br />The rise of open source software and its &quot;<a href="http://www.oreillynet.com/pub/a/oreilly/tim/articles/architecture_of_participation.html" target="_blank">architecture of participation</a>&quot; are completely shaking up the old proprietary-web-services-and-standards approach. Sun Microsystems—whose proprietary Java standard helped define the Web 1.0—is opening its Solaris OS and has even announced the apparent paradox of an <a href="http://blogs.zdnet.com/open-source/?p=418" target="_blank">open-source Digital Rights Management</a> system.<br /><br />Today&#39;s incumbents will have to adapt to the new openness of the Web 2.0. If they stick to their <a href="http://www.gartner.com/DisplayDocument?doc_cd=131038" target="_blank">proprietary standards</a>, code, and content, they&#39;ll become the new walled gardens—places users visit briefly to retrieve data and content from enclosed data silos, but not where users &quot;live.&quot; The incumbents&#39; revenue models will have to change. Instead of &quot;owning&quot; their users, users will know they own themselves, and will expect a return on their valuable identity and attention. Instead of being locked into incompatible media formats, users will expect easy access to digital content across many platforms. 
<br /><br />Yesterday&#39;s web giants and tomorrow&#39;s users will need to find a mutually beneficial new balance—between open and proprietary, developer and user, hierarchical and horizontal, owned and shared, and compatible and closed. <br /><br /><br /><i>Marc Canter is an active evangelist and developer of open standards. Early in his career, Marc founded MacroMind, which became Macromedia. These days, he is CEO of Broadband Mechanics, a founding member of the Identity Gang and of ourmedia.org. Broadband Mechanics is currently developing the <a href="http://www.alwayson-network.com/comments.php?id=11262_0_1_0_C" target="_blank">GoingOn Network</a> (with the AlwaysOn Network), as well as an open platform for social networking called the PeopleAggregator.</i><br /><br />A version of the above post appears in the Fall 2005 issue of AlwaysOn&#39;s quarterly print blogozine, and ran as <a href="http://www.alwayson-network.com/comments.php?id=12063_0_1_0_C" target="_blank">a four-part series</a> on the AlwaysOn Network website.</td></tr></table><br /><p>(Via <a href="http://marc.blogs.it/">Marc&#39;s Voice</a>.)</p></blockquote>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-04-26#810">
  <rss:title>WebDAV, SQLX, and my Weblog</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2005-04-26T03:54:43Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Uche Ogbuji comments in his blog about the use of WebDAV and SQLX in my blog as part of his commentary about Pyblosxom &amp; WebDAV. To provide some clarity about Virtuoso and Blogging I have decided to put out this quick step-by-step guide to the workings of my blog (there is a long overdue technical white paper nearing completion that addresses this subject in more detail). Here goes: Blog Editing I can use any editor that supports the following Blog Post APIs: - Moveable Type - Meta Weblog - Blogger Typically I use Virtuoso (which has an unreleased WYSIWYG blog post editor), Newzcrawler, ecto, Zempt, or w.bloggar for my posts. If a post is of interest to me, or relevant to our company or customers I tend to perform one of the following tasks: - Generate a post using the &quot;Blog This&quot; feature of my blog editor - Write a new post that was triggered by a previously read post etc. Either way, the posts end up in our company wide blog server that is Virtuoso based (more about this below). The internal blog server automatically categorizes my blog posts, and automagically determines which posts to upstream to other public blogs that I author (e.g http://kidehen.typepad.com ) or co-author (e.g http://www.openlinksw.com/weblogs/uda and http://www.openlinksw.com/weblogs/virtuoso ). I write once and my posts are dispatched conditionally to multiple outlets. RSS/Atom/RDF Aggregation &amp; Reading I discover, subscribe to, and view blog feeds using Newzcrawler (primarily), and from time to time for experimentation and evaluation purposes I use RSS Bandit, FeedDemon, and Bloglines. I am in the process of moving this activity over to Virtuoso completely due to the large number of feeds that I consume on a daily basis (scalability is a bit of a problem with current aggregators). 
Blog Publishing When you visit my blog you are experiencing the  soon to be released Virtuoso Blog Publishing engine first hand, which is how WebDAV, SQLX, XQuery/XPath, and Free Text etc. come into the mix. Each time I create a post internally, or subscribe to an external feed, the data ends up in Virtuoso&#39;s SQL Engine (this is how we handle some of the obvious scalability challenges associated with large subscription counts). This engine is SQL2000N based, which implies that it can transform SQL to XML on the fly using recent extensions to SQL in the form of SQLX (prior to the emergence of this standard we used the FOR XML SQL syntax extensions for the same result). It also has its own in-built XSLT processor (DB Engine resident), and validating XML parser (with support for XML Schema).  Thus, my RSS/RDF/Atom archives, FOAF, BlogRoll, OPML, and OCS blog syndication gems are all live examples of SQLX documents that leverage Virtuoso&#39;s WebDAV engine for exposure to Blog Clients. Blog Search When you search for blog posts using the basic or advanced search features of my blog, you end up interacting with one of the following methods of querying data hosted in Virtuoso: Free Text Search, XPath, or XQuery. The result sets produced by the search feature uses SQLX to produce subscription gems (RSS/Atom/RDF/OpenSearch) and URIs that enable dynamic tracking of my posts using your search keywords. BTW - the http://www.openlinksw.com/blog/~kidehen blog home page exists as a result of Virtuoso&#39;s Virtual Domain / Multi-Homing Web Server functionality. The entire site resides in an Object Relational DBMS, and I can take my DB file across Windows, Solaris, Linux, Mac OS X, FreeBSD, AIX, HP-UX, IRIX, and SCO UnixWare without missing a single beat! All I have to do is instantiate my Virtuoso server and my weblog is live.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>Uche Ogbuji <a href="http://copia.ogbuji.net/blog/2005/04/24#Posting_to">comments</a> in his <a href="http://copia.ogbuji.net/blog">blog</a> about the use of WebDAV and <a href="http://www.tbradford.org/2005/02/xml-with-virtuoso-and-sqlx_02.html">SQLX </a>in my blog as part of his commentary about <a href="http://egaumer.pagecache.org/PyBlosxom/pyblosxom-webdav.html">Pyblosxom &amp; WebDAV</a>. To provide some clarity about Virtuoso and Blogging I have decided to put out this quick step-by-step guide to the workings of my blog (there is a long overdue technical white paper nearing completion that addresses this subject in more detail).</p>
<p>Here goes:</p>
<p><u><strong>Blog Editing</strong></u></p>
<p>I can use any editor that supports the following Blog Post APIs:</p>
<p>- Moveable Type</p>
<p>- Meta Weblog</p>
<p>- Blogger</p>
<p>Typically I use Virtuoso (which has an unreleased&nbsp;WYSIWYG blog post editor), <a href="http://www.newzcrawler.com/">Newzcrawler</a>, <a href="http://ecto.kung-foo.tv/">ecto</a>, <a href="http://zempt.com/">Zempt</a>, or <a href="http://www.wbloggar.com/">w.bloggar</a> for my posts. If a post is of interest to me, or relevant to our company or customers&nbsp;I tend to perform one of the following tasks:</p>
<p>- Generate a post using the "Blog This" feature of my blog editor</p>
<p>-&nbsp;Write a new post that was triggered by a previously read post etc.</p>
<p>Either way, the posts end up in our company wide blog server that is Virtuoso based (more about this below). The internal blog server automatically categorizes my blog posts, and automagically determines which posts to upstream to other public blogs that I author (e.g <a href="http://kidehen.typepad.com/">http://kidehen.typepad.com</a> ) or co-author (e.g <a href="http://www.openlinksw.com/weblogs/uda">http://www.openlinksw.com/weblogs/uda</a> and <a href="http://www.openlinksw.com/weblogs/virtuoso">http://www.openlinksw.com/weblogs/virtuoso</a> ). I write once and my posts are dispatched conditionally to multiple outlets.</p>
<p><strong><u>RSS/Atom/RDF Aggregation &amp; Reading</u></strong></p>
<p>I discover, subscribe to, and&nbsp;view blog feeds using <a href="http://www.newzcrawler.com/">Newzcrawler</a> (primarily), and from time to time for experimentation and evaluation purposes I use <a href="http://www.rssbandit.org/">RSS Bandit</a>,&nbsp;<a href="http://www.bradsoft.com/feeddemon/">FeedDemon</a>, and <a href="http://www.bloglines.com/">Bloglines</a>. I am in the process of moving this activity over to Virtuoso completely due to the large number of feeds that I consume on a daily basis (scalability is a bit of a problem with current aggregators).</p>
<p><u><strong>Blog Publishing</strong></u></p>
<p>When you visit my blog you are experiencing the&nbsp; soon to be released Virtuoso Blog Publishing engine first hand, which is how&nbsp;WebDAV, SQLX, XQuery/XPath, and Free Text etc. come into the mix.</p>
<p>Each time I create a post internally, or subscribe to an external feed, the data ends up in Virtuoso's SQL Engine (this is how we handle some of the obvious scalability challenges associated with large subscription counts). This engine is SQL2000N based, which implies that it can transform SQL to XML on the fly using recent extensions to SQL in the form of SQLX (prior to the emergence of this standard we used the FOR XML SQL syntax extensions for the same result). It also has its own in-built XSLT processor (DB Engine&nbsp;resident), and validating XML parser (with support for XML Schema).&nbsp; Thus, my <a href="http://www.openlinksw.com/blog/~kidehen/gems/">RSS/RDF/Atom archives, FOAF, BlogRoll, OPML, and OCS</a> blog syndication gems are all live examples of SQLX documents that leverage Virtuoso's WebDAV engine for exposure to&nbsp;Blog Clients.</p>
<p><strong><u>Blog Search</u></strong></p>
<p>When you search for blog posts using the basic or <a href="http://www.openlinksw.com/blog/search.vspx?blogid=127">advanced search</a> features of my blog, you end up interacting with one of the following methods of querying data hosted in Virtuoso: Free Text Search, XPath, or XQuery. The <a href="http://www.openlinksw.com/blog/search.vspx?blogid=127&q=virtuoso&type=text&output=html">result sets</a> produced by the search feature uses SQLX to produce subscription gems (<a href="http://www.openlinksw.com/blog/search.vspx?blogid=127&q=virtuoso&type=text&output=xml">RSS</a>/<a href="http://www.openlinksw.com/blog/search.vspx?blogid=127&q=virtuoso&type=text&output=atom">Atom</a>/<a href="http://www.openlinksw.com/blog/search.vspx?blogid=127&q=virtuoso&type=text&output=rdf">RDF</a>/<a href="http://www.openlinksw.com/blog/search.vspx?blogid=127&type=text&kwds=virtuoso&OpenSearch">OpenSearch</a>) and <a href="http://www.openlinksw.com/blog/search.vspx?blogid=127&q=virtuoso&type=text&output=html">URIs</a> that enable dynamic tracking of my posts using your search keywords.</p>
<p>BTW - the <a href="http://www.openlinksw.com/blog/~kidehen">http://www.openlinksw.com/blog/~kidehen</a> blog home page exists as a result of Virtuoso's Virtual Domain / Multi-Homing Web Server functionality. The entire site resides in an Object Relational DBMS, and I can take my DB file across Windows, Solaris, Linux, Mac OS X, FreeBSD, AIX, HP-UX, IRIX, and SCO UnixWare without missing a single beat! All I have to do is instantiate my Virtuoso server and my weblog is live.</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2005-03-26#766">
  <rss:title>Back To The Future: Hypermedia</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2005-03-26T20:24:30Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">If a picture speaks a thousand words, I sometimes wonder how many words we attribute to a multimedia clip? Especially one that is now openly accessible to many who don&#39;t quite understand the high degree of: &quot;Back To The Future&quot; quotient of most of what we see today. The Internet Archive initiative is building up an amazing collection of content that includes this &quot;must watch&quot; movie about the somewhat forgotten hypercard development environment. As I watched the hypercard movie I obtained clear reassurance that my vision of Web 2.0 as critical infrastructure for a future Semantic Web isn&#39;t unfounded. The solution building methodology espoused by hypercard is exactly how Semantic Web applications will be built, and this will be done by orchestrating the componentary of Web 2.0. When watching this clip make the following mental adjustments: Swap hypercard stacks for discrete and/or composite services that have published endpoints exposed by Web 2.0 points of presence Think of information taking the form of XML based content e.g. RSS, Atom, RDF, FOAF, XFN, and other future XML based data contextualization formats; all accessible via URIs When the Apple Mac operating system is mentioned (or inferred) think of the Internet (you don&#39;t need Windows, Mac OS, Linux, UNIX etc. to realize the vision, the network provided by the Internet is the Operating System) When the Apple computer is mentioned simply think about a plethora of function specific devices (computers, mobile phones, PDAs etc.) that overtly or covertly provide conduits to the new operating environment (the Internet) As you hear the term &quot;whole new body of people that are non programmers contributing their ideas&quot; think about yourself and the increasing ease of participation that&#39;s beginning to take shape in this emerging frontier! 
As for &quot;Whole Earth Catalog&quot;, think Wikipedia or more recent efforts such as Answers.com. Web 2.0 is a reflection of the web taking its first major step out of the technology stone age (certainly the case relative to the hypercard movie and &quot;pre web&quot; application development in general).  </dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>If a picture speaks a thousand words, I sometimes wonder how many words we attribute to a multimedia clip? Especially one that is now openly accessible to many who don't quite understand the high degree of: "Back To The Future" quotient of most of what we see today.</p>
<p>The Internet&nbsp;Archive initiative is building up an amazing&nbsp;collection of content&nbsp;that includes this <a href="http://www.archive.org/movies/details-db.php?collection=computerchronicles&collectionid=CC501_hypercard">"must watch" movie</a> about the somewhat forgotten <a href="http://en.wikipedia.org/wiki/Hypercard">hypercard</a> development environment.</p>
<p>As I watched the hypercard movie I obtained clear reassurance that my vision of <a href="http://en.wikipedia.org/wiki/Web_2.0">Web 2.0</a> as critical infrastructure for a future Semantic Web isn't unfounded. The solution building methodology espoused by hypercard is exactly how Semantic Web applications will be built, and this will be done by orchestrating&nbsp;the componentary of Web 2.0.</p>
<p>When watching this clip make the following mental adjustments:</p>
<ol>
<li>Swap hypercard stacks for discrete and/or composite services that have published endpoints exposed by Web 2.0 points of presence<br><br></li>
<li>Think of information taking the form of XML based content e.g. RSS, Atom, RDF, FOAF, XFN, and other future&nbsp;XML based data contextualization&nbsp;formats; all accessible via URIs<br><br></li>
<li>When the Apple Mac operating system is mentioned (or inferred) think of the Internet&nbsp;(you don't need Windows, Mac OS, Linux,&nbsp;UNIX etc.&nbsp;to realize the vision, the network provided by the Internet&nbsp;is the Operating System)<br><br></li>
<li>When the Apple computer is mentioned simply think about a plethora of function specific devices (computers, mobile phones, PDAs etc.) that overtly or covertly provide conduits to the new operating environment (the Internet)<br><br></li>
<li>As you hear the term "whole new body of people that are non programmers contributing their ideas" think about yourself and the increasing ease of participation&nbsp;that's beginning to take shape in this&nbsp;emerging frontier!<br><br></li>
<li>As for "<a href="http://www.wholeearthmag.com/about.html">Whole Earth Catalog", </a>think <a href="http://en.wikipedia.org">Wikipedia</a>&nbsp;or more recent efforts such as <a href="http://www.answers.com">Answers.com</a>.</li></ol>
<p>Web 2.0 is a reflection of the web taking its first major step out of the technology stone age (certainly the case relative to the hypercard movie and "pre web" application development in general). </p>
<p>&nbsp;</p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2004-08-26#611">
  <rss:title>Is Google Web 2.0&#39;s Netscape?</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2004-08-26T21:52:30Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I put this piece together in response to another stimulating post by Dare Obasanjo titled &quot;Is Google the Next Microsoft or the Next Netscape?&quot;. I changed the title of this post to project the fact that Web 2.0 provides the appropriate context (IMHO) for Dare&#39;s point re. &quot;Web Site Stickiness&quot;. Stickiness is a defining characteristic of Web 1.0 . It&#39;s all about eyeballs (site visitors) which implied ultimately that all early Web business models ended up down the advertising route. I always felt that Web 1.0 was akin to having a crowd of people at your reception area seeking a look at your corporate brochures, and then someone realizes that you could start selling AD space in these brochures in response to the growing crowd size and frequency of congregation. The long-term folly of this approach is now obvious, as many organizations forgot their core value propositions (expressed via product offerings) in the process and wandered blindly down the AD model cul-de-sac, and we all know what happened down there.. Web 2.0 is taking shape (the inflection is in its latter stages), and the defining characteristics of Web 2.0 are: Fabric of Executable Endpoints Semantic Content (the RSS/RDF/Atom/FOAF semantic crumbs emerging from the Blogosphere are great examples of things to come re. 
XQuery queries over HTTP for instance) Migration from the Web Site (defined by static or dynamic HTML page generation) concept, to that of a &quot;Web Point of Presence&quot; (I don&#39;t know if this term will catch on, but the conceptual essence here is factual) that enables an organization to achieve the following: Package/catalog value proposition (product and services) using RSS/RDF/Atom Provide SOAP compliant Executable Endpoints (Web Services) for consuming value proposition (as opposed to being distracted by the AD model) Provide Web Services for consummating contracts associated with core value proposition Identification of internal efficiencies, new products/services that leverage Semantic Content and Web Services, and tangibly exploit: Composite Web Services construction from legacy monolithic application pools Standards based (e.g. BPEL) orchestration and integration of disparate composite services (across the Fabric referred to above) When you factor in all of the above, the real question is whether Google and others are equipped to exploit Web 2.0?  To some degree, is the best answer at the current time as they have commenced the transition from &quot;content only&quot; web site to web platform (via the many Web Services initiatives that expose SOAP and REST interfaces to various services), but there is much more to this journey, and that&#39;s the devil in the &quot;competitive landscape details&quot;. From my obviously biased perspective, I think Virtuoso and Yukon+WinFS provide the server models for driving Web 2.0 points of presence (single server instances that implement multiple protocols). Thus, if Google, Yahoo! et al. aren&#39;t exploiting these or similar products, then they will be vulnerable over the long term to the competitive challenges that a Web 2.0 landscape will present.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<p>I put this piece together in response to another <a href="http://www.25hoursaday.com/weblog/CommentView.aspx?guid=5ab1ca87-b0df-4dd0-99b6-7730955620ab">stimulating post</a> by Dare Obasanjo titled "Is Google the Next Microsoft or the Next Netscape?". I changed the title of this post to project the fact that Web 2.0 provides the appropriate context (IMHO) for Dare's point re. "Web Site Stickiness". </p>
<p>Stickiness is a defining characteristic of Web 1.0 . It's all about eyeballs (site visitors) which implied ultimately that all early Web business models ended up down the advertising route. </p>
<p>I always felt that Web 1.0 was akin to having a crowd of people at your reception area seeking a look at your corporate brochures, and then someone realizes that you could start selling AD space in these brochures in response to the growing crowd size and frequency of congregation. The long-term folly of this approach is now obvious, as many organizations forgot their core value propositions (expressed via product offerings) in the process and wandered blindly down the AD model cul-de-sac, and we all know what happened down there.. </p>
<p>Web 2.0 is taking shape (the inflection is in its latter stages), and the defining characteristics of Web 2.0 are: </p>
<ol>
<li>Fabric of Executable Endpoints <br></li>
<li>Semantic Content (the RSS/RDF/Atom/FOAF semantic crumbs emerging from the Blogosphere are great examples of things to come re. XQuery queries over HTTP for instance) Migration from the Web Site (defined by static or dynamic HTML page generation) concept, to that of a "Web Point of Presence" (I don't know if this term will catch on, but the conceptual essence here is factual) that enables an organization to achieve the following: <br></li>
<ul>
<li>Package/catalog value proposition (product and services) using RSS/RDF/Atom <br></li>
<li>Provide SOAP compliant Executable Endpoints (Web Services) for consuming value proposition (as opposed to being distracted by the AD model) <br></li>
<li>Provide Web Services for consummating contracts associated with core value proposition Identification of internal efficiencies, new products/services that leverage Semantic Content and Web Services, and tangibly exploit: <br></li>
<ul>
<li>Composite Web Services construction from legacy monolithic application pools <br></li>
<li>Standards based (e.g. BPEL) orchestration and integration of disparate composite services (across the Fabric referred to above) </li></ul></ul></ol>
<p>When you factor in all of the above, the real question is whether Google and others are equipped to exploit Web 2.0? &nbsp;To some degree, is the best answer at the current time as&nbsp;they have commenced the transition from&nbsp;"content only" web site&nbsp;to web platform (via the many Web Services initiatives that expose SOAP and REST interfaces to various services), but there is much more to this journey, and that's the devil in the "competitive landscape details". </p>
<p>From my obviously biased perspective, I think <a href="http://virtuoso.openlinksw.com/">Virtuoso</a> and <a href="http://www.midrangeserver.com/two/two042804-story02.html">Yukon+WinFS</a> provide the server models for driving Web 2.0 points of presence (single server instances that&nbsp;implement multiple protocols). Thus,&nbsp;if Google, Yahoo! et al.&nbsp;aren't exploiting these or similar products, then they will be vulnerable over the long term to the competitive&nbsp;challenges that a Web 2.0&nbsp;landscape will present. </p>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-10-02#383">
  <rss:title>RSS: The Best Of All Possible Worlds</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-10-03T02:37:52Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">RSS: The Best Of All Possible Worlds The thing that most surprised me today in the SoftEdge panel on Social Software was the reaction to RSS. I should be clear that I am an RSS true believer. It seems to me that metadata as a byproduct of social software engines (be it blogging or social networking or whatever) is not only enviable, it is inevitable. RSS and FOAF and other yet-to-be-determined social software data protocols will become standards because it simply makes good sense for them to be standardized. Anyone paying attention to the unbelievable development and adoption curve of wireless can appreciate the immense value driven by standards -- and, in particular, standards that are truly standard. So it came as a bit of a shock to me that when I questioned the panelists on the implications of RSS and the Semantic Web, they were less sold on the inevitability of it all. When asked the question of whether the proliferation of RSS and FOAF might make it possible for reader technology to be the next killer application in knowledge management, I got very strong reactions from both Reid Hoffman and Meg Hourihan. Reid stated that he did not believe that RSS was sufficiently robust to provide significant value at any level. Meg followed up with a general indictment of the semantic web, which she views merely as a geek utopia. I will admit that I&#39;m a fan of Candide (particularly at the hands of Bernstein), but I hardly view myself as Pangloss. One need look no further than, for example, the tools that Oddpost has incorporated into its web email client to allow an integrated email and blog experience. 
Better yet, through a relatively simple web service, Oddpost can deliver an RSS feed of a particular Google News search so that you can keep track of keywords that are of interest to you without having to visit Google repeatedly to find out if your company or candidate or favorite band has been mentioned in today&#39;s news. The same is true of watch lists on Technorati. Rather than periodically check to see if someone has linked to your blog, Technorati will do the work for you and deliver the info to your inbox only when there is information to be delivered. These examples are just the tip of the iceberg but they demonstrate the nascent power of RSS and related standards. I&#39;ll have to wait for another panel to have that argument with Reid and Meg. [via VentureBlog]</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<A href="http://www.ventureblog.com/articles/indiv/2003/000192.html">RSS: The Best Of All Possible Worlds</A> 
<P>The thing that most surprised me today in the <A href="http://www.pulver.com/rvc2003/">SoftEdge</A> panel on Social Software was the reaction to RSS. I should be clear that I am an RSS true believer. It seems to me that metadata as a byproduct of social software engines (be it blogging or social networking or whatever) is not only enviable, it is inevitable. <A href="http://www.oreillynet.com/rss/">RSS</A> and <A href="http://www.foaf-project.org/">FOAF</A> and other yet-to-be-determined social software data protocols will become standards because it simply makes good sense for them to be standardized. Anyone paying attention to the unbelievable development and adoption curve of wireless can appreciate the immense value driven by standards -- and, in particular, standards that are truly standard. So it came as a bit of a shock to me that when I questioned the panelists on the implications of RSS and the Semantic Web, they were less sold on the inevitability of it all. </P>
<P>When asked the question of whether the proliferation of RSS and FOAF might make it possible for reader technology to be the next killer application in knowledge management, I got very strong reactions from both Reid Hoffman and Meg Hourihan. Reid stated that he did not believe that RSS was sufficiently robust to provide significant value at any level. Meg followed up with a general indictment of the semantic web, which she views merely as a geek utopia. I will admit that I'm a fan of Candide (particularly at the hands of <A href="http://www.leonardbernstein.com/">Bernstein</A>), but I hardly view myself as Pangloss. One need look no further than, for example, the tools that <A href="http://www.oddpost.com/learnmore.html">Oddpost</A> has incorporated into its web email client to allow an integrated email and blog experience. Better yet, through a relatively simple web service, Oddpost can deliver an RSS feed of a particular Google News search so that you can keep track of keywords that are of interest to you without having to visit Google repeatedly to find out if your company or candidate or favorite band has been mentioned in today's news. The same is true of watch lists on <A href="http://www.technorati.com/watchlists/index.html">Technorati</A>. Rather than periodically check to see if someone has linked to your blog, Technorati will do the work for you and deliver the info to your inbox only when there is information to be delivered. These examples are just the tip of the iceberg but they demonstrate the nascent power of RSS and related standards. I'll have to wait for another panel to have that argument with Reid and Meg. </P>
<DIV align=right>[via <A href="http://www.ventureblog.com/">VentureBlog</A>]
<DIV></DIV></DIV>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-09-25#373">
  <rss:title>Jeff Bezos Comments about Web Services</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-09-25T18:48:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">The following excerpt from a recent BusinessWeek interview with Jeff Bezos demonstrates the importance of the &quot;Executable Web&quot; aspect of Web 2.0 (next generation Web comprising two complementary tracks: Executable Web of Web Services and Syndicated Web or XML based content such as RSS, RDF, OPML, OCS, FOAF etc.). Q: Amazon.com now runs sites and on-line operations for retailers such as Target and Toys &#39;R&#39; Us. What&#39;s the future for that services business? A: It&#39;s a rapidly growing part of our business. And that goes from [large] companies that are customers of that all the way down to individuals using our Web services to tap into the fundamental platform that is Amazon.com. They can build their own applications very effectively. It&#39;s almost closer to an ecosystem. Q: So Amazon is becoming a kind of software platform a bit like Microsoft (MSFT )? A: People are building stuff that surprises us. That&#39;s what&#39;s so interesting about this. We&#39;ve built this big base of technology to serve ourselves, and now we&#39;re opening it up and letting people access it. They&#39;re taking these fundamental pieces and building completely new things that not only would we have never gotten around to but in some cases maybe never even have thought of. There are thousands of developers who are building applications using Amazon Web services. The sky&#39;s the limit on their creativity. Q: What arises from all those efforts? A: People will be able to build very powerful applications by hooking together a whole bunch of Web services from a whole bunch of different companies. Q: What benefit is Amazon.com getting from this? A: It&#39;s too early to say. It&#39;s certainly not a major source of revenue for us. But when people use our Web services, they give us credit for that. That turns out to be very helpful. 
A few years ago the race was on to simply have a Web Site, then this requirement evolved into a requirement for a database driven site. Today we are seeing the final stages of the Web 2.0 inflection which will inevitably change the focus toward the need for a Point of Presence on the Web for exposing or invoking Web Services and/or Syndicating or Subscribing to XML based content.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[The following excerpt from a recent <a href="http://yahoo.businessweek.com/magazine/content/03_39/b3851607.htm">BusinessWeek interview with Jeff Bezos</a> demonstrates how important the "Executable Web" aspect of Web 2.0 is (next generation Web comprising two complementary tracks: Executable Web of Web Services and Syndicated Web or XML based content such as <a href="http://blogs.law.harvard.edu/tech/rss">RSS</a>, <a href="http://www.w3.org/RDF/">RDF</a>, <a href="http://www.opml.org/">OPML</a>, <a href="http://internetalchemy.org/ocs/">OCS</a>, <a href="http://www.foaf-project.org/">FOAF</a> etc.).
<blockquote>Q: Amazon.com now runs sites and on-line operations for retailers such as Target and Toys 'R' Us. What's the future for that services business?
A: It's a rapidly growing part of our business. And that goes from [large] companies that are customers of that all the way down to individuals using our Web services to tap into the fundamental platform that is Amazon.com. They can build their own applications very effectively. It's almost closer to an ecosystem.

Q: So Amazon is becoming a kind of software platform a bit like Microsoft (MSFT )?
A: People are building stuff that surprises us. That's what's so interesting about this. We've built this big base of technology to serve ourselves, and now we're opening it up and letting people access it.

They're taking these fundamental pieces and building completely new things that not only would we have never gotten around to but in some cases maybe never even have thought of. There are thousands of developers who are building applications using Amazon Web services. The sky's the limit on their creativity.

Q: What arises from all those efforts?
A: People will be able to build very powerful applications by hooking together a whole bunch of Web services from a whole bunch of different companies.

Q: What benefit is Amazon.com getting from this?
A: It's too early to say. It's certainly not a major source of revenue for us. But when people use our Web services, they give us credit for that. That turns out to be very helpful.
</blockquote>
A few years ago the race was on to simply have a Web Site, then this requirement evolved into a requirement for a database driven site. Today we are seeing the final stages of the Web 2.0 inflection which will inevitably change the focus toward the need for a Point of Presence on the Web for exposing or invoking Web Services and/or Syndicating or Subscribing to XML based content.
]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-22#245">
  <rss:title>The Well-Formed Web </rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-08-22T18:32:02Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">I just came across this article while brainstorming about the Comment API and its potential use (subject of another post as this is being implemented as I write) within Blog Clients (RSS Aggregators and Readers). Back to the article. This is an essay by George Gregorio who is so into auto discovery that he deliberately stuffed his contact details in an FOAF file that you need to auto discover using a FOAF auto discovery aware client (e.g. FOAFnaut or the human brain for instance :-) ) . Anyway, here is an excerpt from his essay (a very good read). Over a month ago Paul Ford published a great essay entitled How Google beat Amazon and Ebay to the Semantic Web. After reading it the first time I thought it was a great introduction to the Semantic Web, an idea I had been trying to wrap my head around even since encountering RDF as it is baked into RSS 1.0. I had seen the light and bought into the promise of the Semantic Web. Time passes... With Dave Winer&#39;s floating of the idea of RSS 2.0 discussions ensue about the RDF in RSS 1.0. After spending some time badgering poor Bill Kearney for a concrete benefit of having RDF in RSS 1.0 and not getting a really satisfactory answer I went back and read Paul Ford&#39;s essay again. I wanted to get that old religious feeling back again. It didn&#39;t work. The magic was gone. Read on...  </dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<P>I just came across this article while brainstorming about the <A href="http://wellformedweb.org/story/9/#auto">Comment API</A> and its potential use (subject of another post as this is being implemented as I write) within Blog Clients (RSS Aggregators and Readers).</P>
<P>Back to the article. This is an essay by <A href="http://bitworking.org/foaf.rdf">George Gregorio </A>who is so into auto discovery that he deliberately stuffed his contact details in an FOAF file that you need to auto discover using a FOAF auto discovery aware client&nbsp;(e.g. FOAFnaut&nbsp;or the human brain&nbsp;for instance :-) ) . Anyway, here is an excerpt from his essay (a very good read).</P>
<BLOCKQUOTE dir=ltr style="MARGIN-RIGHT: 0px">
<P>Over a month ago <A href="http://www.ftrain.com/">Paul Ford</A> published a great essay entitled <A href="http://www.ftrain.com/google_takes_all.html">How Google beat Amazon and Ebay to the Semantic Web</A>. After reading it the first time I thought it was a great introduction to the <A href="http://www.w3.org/2001/sw/">Semantic Web</A>, an idea I had been trying to wrap my head around even since encountering RDF as it is baked into <A title="RDF Site Summary" href="http://www.purl.org/rss/1.0/spec">RSS 1.0</A>. I had seen the light and bought into the promise of the Semantic Web. </P>
<P>Time passes...</P>
<P><!--StartFragment -->With Dave Winer's floating of the idea of <A href="http://backend.userland.com/rss">RSS 2.0</A> discussions ensue about the RDF in RSS 1.0. After spending some time badgering poor Bill Kearney for a <A href="http://burningbird.net/cgi-bin/mt-comments.cgi?entry_id=528">concrete benefit of having RDF in RSS 1.0</A> and not getting a really satisfactory answer I went back and read Paul Ford's essay again. I wanted to get that old religious feeling back again. It didn't work. The magic was gone. </P></BLOCKQUOTE>
<P dir=ltr><A href="http://wellformedweb.org/story/1">Read on...</A></P>
<P>&nbsp;</P>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-22#244">
  <rss:title>Cool XSL-T Tutorial</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-08-22T04:07:59Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">XSLT is one of the most powerful aspects of the entire XML value proposition (this weblog site is an example of what XML and XSLT can deliver), but is also one of the more daunting aspects (both hands-on and getting your brain wrapped around the syntax). Here is a really nice XSLT tutorial site. Demystify XSLT, and the world of XML&#39;s potential really opens up. It certainly accelerates the comprehension of the concept of generating RSS from internal data sources - bearing in mind that in the case of Virtuoso we use our in-built XSLT processor to facilitate XML-RPC to SOAP bridging, SQL-XML, RSS, OPML, RDF, FOAF, Atom|Echo, OCS feed generation amongst other things.</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[XSLT is one of the most powerful aspects of the entire XML value proposition (this weblog site is an example of what XML and XSLT can deliver), but is also one of the more daunting aspects (both hands-on and getting your brain wrapped around the syntax).  Here is a really nice <a href="http://www.zvon.org/xxl/XSLTutorial/Books/Output/example1_ch1.html">XSLT tutorial</a> site.

Demystify XSLT, and the world of XML's potential really opens up. It certainly accelerates the comprehension of the concept of <a href="http://jena.hpl.hp.com:3030/blojsom-hp/blog/technologies/blogging/metadata/?">generating RSS from internal data sources</a> - bearing in mind that in the case of <a href="http://www.openlinksw.com/virtuoso">Virtuoso</a> we use our in-built XSLT processor to facilitate XML-RPC to SOAP bridging, SQL-XML, RSS, OPML, RDF, FOAF, Atom|Echo, OCS feed generation amongst other things.]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-21#241">
  <rss:title>RSS: INJAN (It&#39;s not just about news)</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-08-21T15:41:25Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">When Virtuoso first unleashed support for XML (in-built XSL, Native XML Storage, Validating XML Parser, XPath, and XQuery) the core message was the delivery of a single server solution that would address the challenges of creating XML data. In the year 2000 the question of the shape and form of XML data was unclear to many, and reading the article below basically took me back in time to when we released Virtuoso 2.0 (we are now at release 3.0 commercially with a 3.2 beta dropping any minute). RSS is a great XML application, and it does a great job of demonstrating how XML --the new data access foundation layer-- will galvanize the next generation Web (I refer to this as Web 2.0.). RSS: INJAN (It&#39;s not just about news) RSS is not just about news, according to Ian Davis on rss-dev. He presents a nice list of alternatives, which I reproduce here (and to which I&#39;d add, of course, bibliography management) Sitemaps: one of the S&#39;s in RSS stands for summary. A sitemap is a summary of the content on a site, the items are pages or content areas. This is clearly a non-chronological ordering of items. Is a hierarchy of RSS sitemaps implied here – how would the linking between them work? How hard would it be to hack a web browser to pick up the RSS sitemap and display it in a sidebar when you visit the site? Small ads: also known as classifieds. These expire so there&#39;s some kind of dynamic going on here but the ordering of items isn&#39;t necessarily chronological. How to describe the location of the seller, or the condition of the item or even the price. Not every ad is selling something – perhaps it&#39;s to rent out a room. Personals: similar model to the small ads. No prices though (I hope). Comes with a ready made vocabulary of terms that could be converted to an RDF schema. 
Probably should do that just for the hell of it anyway – gsoh Weather reports: how about a week&#39;s worth of weather in an RSS channel. If an item is dated in the future, should an aggregator display it before time? Alternate representations include maps of temperature and pressure etc. Auctions: again, related to small ads, but these are much more time limited since there is a hard cutoff after which the auction is closed. The sequence of bids could be interesting – would it make sense to thread them like a discussion so you can see the tactics? TV listings: this is definitely chronological but with a twist – the items have durations. They also have other metadata such as cast lists, classification ratings, widescreen, stereo, program type. Some types have additional information such as director and production year. Top ten listings: top ten singles, books, dvds, richest people, ugliest, rear of the year etc. Not chronological, but has definate order. May update from day to day or even more often. Sales reporting: imagine if every department of a company reported their sales figures via RSS. Then the divisions aggregate the departmental figures and republish to the regional offices, who aggregate and add value up the chain. The chairman of the company subscribes to one super-aggregate feed. Membership lists / buddy lists: could I publish my buddy list from Jabber or other instant messengers? Maybe as an interchange format or perhaps could be used to look for shared contacts. Lots of potential overlap with FOAF here. Mailing lists: or in fact any messaging system such as usenet. There are some efforts at doing this already (e.g. yahoogroups) but we need more information – threads; references; headers; links into archives. Price lists / inventory: the items here are products or services. No particular ordering but it&#39;d be nice to be able to subscribe to a catalog of products and prices from a company. 
The aggregator should be able to pick out price rises or bargains given enough history. [via Semantic Blogging Demonstrator] Thus, if we can comprehend RSS (the blog article below does a great job) we should be able to see the fundamental challenges that are before any organization seeking to exploit the potential of the imminent Web 2.0 inflection; how will you cost-effectively create XML data from existing data sources? Without upgrading or switching database engines, operating systems, programming languages? Put differently how can you exploit this phenomenon without losing your ever dwindling technology choices (believe me choices are dwindling fast but most are oblivious to this fact). xml rss syndication</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[
<p><span style="font-size: 10pt; font-family: Arial;">When Virtuoso first unleashed support for XML (in-built XSL, Native XML Storage, Validating XML Parser, XPath, and XQuery) the core message was the delivery of a single server solution that would address the challenges of creating XML data.</span></p><p xmlns="o"></p> <p><span style="font-size: 10pt; font-family: Arial;">In the year 2000 the question of the shape and form of XML data was unclear to many, and reading the article below basically took me back in time to when we released <a href="http://www.it-director.com/article.php?articleid=916">Virtuoso 2.0</a> (we are now at <a href="http://www.openlinksw.com/virtuoso">release 3.0</a> commercially with a <a href="http://www.openlinksw.com/press/virt32_wwdc1.htm">3.2 beta </a>dropping any minute).</span></p><p xmlns="o"></p> <p><span style="font-size: 10pt; font-family: Arial;">RSS is a great XML application, and it does a great job of demonstrating how XML --the new data access foundation layer-- will galvanize the next generation Web (I refer to this as Web 2.0.). </span></p> <blockquote dir="ltr" style="margin-right: 0px;"><span style="font-size: 10pt; font-family: Arial;"> <p><a href="http://jena.hpl.hp.com:3030/blojsom-hp/blog/technologies/blogging/metadata/?permalink=1214847A10C1966396472E816A7A4243.textile">RSS: INJAN (It&#39;s not just about news)</a> </p> <p><span class="caps">RSS</span> is not just about news, according to <a href="http://groups.yahoo.com/group/rss-dev/message/5764">Ian Davis on rss-dev</a>.<br />He presents a nice list of alternatives, which I reproduce here (and to which I&#39;d add, of course, bibliography management)</p> <ul> <li>Sitemaps: one of the S&#39;s in <span class="caps">RSS</span> stands for summary. A sitemap is a summary of the content on a site, the items are pages or content areas. This is clearly a non-chronological ordering of items. 
Is a hierarchy of <span class="caps">RSS</span> sitemaps implied here – how would the linking between them work? How hard would it be to hack a web browser to pick up the <span class="caps">RSS</span> sitemap and display it in a sidebar when you visit the site?</li> <li>Small ads: also known as classifieds. These expire so there&#39;s some kind of dynamic going on here but the ordering of items isn&#39;t necessarily chronological. How to describe the location of the seller, or the condition of the item or even the price. Not every ad is selling something – perhaps it&#39;s to rent out a room.</li> <li>Personals: similar model to the small ads. No prices though (I hope). Comes with a ready made vocabulary of terms that could be converted to an <span class="caps">RDF</span> schema. Probably should do that just for the hell of it anyway – gsoh</li> <li>Weather reports: how about a week&#39;s worth of weather in an <span class="caps">RSS</span> channel. If an item is dated in the future, should an aggregator display it before time? Alternate representations include maps of temperature and pressure etc.</li> <li>Auctions: again, related to small ads, but these are much more time limited since there is a hard cutoff after which the auction is closed. The sequence of bids could be interesting – would it make sense to thread them like a discussion so you can see the tactics?</li> <li>TV listings: this is definitely chronological but with a twist – the items have durations. They also have other metadata such as cast lists, classification ratings, widescreen, stereo, program type. Some types have additional information such as director and production year.</li> <li>Top ten listings: top ten singles, books, dvds, richest people, ugliest, rear of the year etc. Not chronological, but has definate order. 
May update from day to day or even more often.</li> <li>Sales reporting: imagine if every department of a company reported their sales figures via <span class="caps">RSS</span>. Then the divisions aggregate the departmental figures and republish to the regional offices, who aggregate and add value up the chain. The chairman of the company subscribes to one super-aggregate feed.</li> <li>Membership lists / buddy lists: could I publish my buddy list from Jabber or other instant messengers? Maybe as an interchange format or perhaps could be used to look for shared contacts. Lots of potential overlap with <span class="caps">FOAF</span> here.</li> <li>Mailing lists: or in fact any messaging system such as usenet. There are some efforts at doing this already (e.g. yahoogroups) but we need more information – threads; references; headers; links into archives.</li> <li>Price lists / inventory: the items here are products or services. No particular ordering but it&#39;d be nice to be able to subscribe to a catalog of products and prices from a company. The aggregator should be able to pick out price rises or bargains given enough history.</li> <div align="right">[via <a href="http://jena.hpl.hp.com:3030/blojsom-hp/blog/">Semantic Blogging Demonstrator</a>] </div></ul></span></blockquote> <p><span style="font-size: 10pt; font-family: Arial;">Thus, if we can comprehend RSS (the blog article below does a great job) we should be able to see the fundamental challenges that are before any organization seeking to exploit the potential of the imminent Web 2.0 inflection; how will you cost-effectively create XML data from existing data sources? Without upgrading or switching database engines, operating systems, programming languages? Put differently how can you exploit this phenomenon without losing your ever dwindling technology choices (believe me choices are dwindling fast but most are oblivious to this fact).</span></p><p xmlns="o"></p> <p>&nbsp;</p>
<a href="index.vspx?tag=xml" rel="tag" style="display:none;">xml</a><a href="index.vspx?tag=rss" rel="tag" style="display:none;">rss</a><a href="index.vspx?tag=syndication" rel="tag" style="display:none;">syndication</a>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-08-05#231">
  <rss:title>Do We Need the Semantic Web?</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-08-05T15:43:30Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">0</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[0]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-05-31#347">
  <rss:title>Semantic Web Client UI Diagram</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-05-31T22:08:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Semantic Web Client UI Diagram I&#39;m getting really excited by this Semantic Web stuff I&#39;m doing. here&#39;s a screenshot / diagram of how it works to display some dynamic UI based on FOAF, RSS, and some movie information. The UI is written using a number of small (less than 1K) XUL and XBL files, although any kind of XML file can theoretically be used. [via Neil&#39;s Place] This is simply cool!</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<a href="http://www.xulplanet.com/cgi-bin/ndeakin/homeN.cgi?ai=136">Semantic Web Client UI Diagram</a> I&#39;m getting really excited by this Semantic Web stuff I&#39;m doing. here&#39;s a screenshot / diagram of how it works to display some dynamic UI based on FOAF, RSS, and some movie information. The UI is written using a number of small (less than 1K) XUL and XBL files, although any kind of XML file can theoretically be used. 
<div align="right">[via <a href="http://www.xulplanet.com/ndeakin/">Neil&#39;s Place</a>]
<div></div></div><em>This is simply cool!</em>]]></content:encoded>
 </rss:item>
 <rss:item xmlns:rss="http://purl.org/rss/1.0/" rdf:about="http://www.openlinksw.com/blog/kidehen@openlinksw.com/blog/?date=2003-05-31#78">
  <rss:title>Semantic Web Client UI Diagram</rss:title>
  <dc:date xmlns:dc="http://purl.org/dc/elements/1.1/">2003-05-31T22:08:00Z</dc:date>
  <dc:description xmlns:dc="http://purl.org/dc/elements/1.1/">Semantic Web Client UI Diagram I&#39;m getting really excited by this Semantic Web stuff I&#39;m doing. here&#39;s a screenshot / diagram of how it works to display some dynamic UI based on FOAF, RSS, and some movie information. The UI is written using a number of small (less than 1K) XUL and XBL files, although any kind of XML file can theoretically be used. [via Neil&#39;s Place] This is simply cool!</dc:description>
  <content:encoded xmlns:content="http://purl.org/rss/1.0/modules/content/"><![CDATA[<A href="http://www.xulplanet.com/cgi-bin/ndeakin/homeN.cgi?ai=136">Semantic Web Client UI Diagram</A> I'm getting really excited by this Semantic Web stuff I'm doing. here's a screenshot / diagram of how it works to display some dynamic UI based on FOAF, RSS, and some movie information. The UI is written using a number of small (less than 1K) XUL and XBL files, although any kind of XML file can theoretically be used. 
<DIV align=right>[via <A href="http://www.xulplanet.com/ndeakin/">Neil's Place</A>]
<DIV></DIV></DIV><EM>This is simply cool!</EM>]]></content:encoded>
 </rss:item>
</rdf:RDF>