1. okfn
  2. jiscobib

Commits

beno  committed 1e229e4

Fixing some issues with the exported subgraphs, and adding an additional export file (BNBrdfdcXX.lean.nquads) which avoids duplicated entity quads

Comments (0)

Files changed (1)

File jiscobib/bltonquads.py Modified

View file
  • Ignore whitespace
  • Hide word diff
     # append the predicate:
     lines.append(line)
     
+    minteduri = ""
     # this time, won't be a bnode:
     eid = kvs.get(hash_val)
     if eid is None:
         eid = "E%d" % kvs.incr("entity")
         kvs.set(hash_val, eid)
+        minteduri = ENTITY[eid]
 
     lines.append(u'      <foaf:Agent rdf:about="%s">' % ENTITY[eid])
     # append the canonical version as an identifier
   else:
     # Null value
     log.warn("Couldn't get literal value for %s - %s" % (line, (bnst, literal, bnend, endtag)))
+  return minteduri
 
 def promote_series(fh, lines, line, b_uri):
   bnst, literal, bnend, endtag = [fh.readline().rstrip() for x in xrange(4)]
     
     # append the predicate:
     lines.append(line)
-   
+    
+    minteduri = ""
     sid = kvs.get(hash_val)
     if sid is None:
         sid = "S%d" % kvs.incr("series")
         kvs.set(hash_val, sid)
+        minteduri = SERIES[sid]
  
     # this time, won't be a bnode, but a bibo:Document:
     lines.append(u'      <bibo:Series rdf:about="%s">' % SERIES[sid])
   else:
     # Null value here.
     log.warn("Couldn't get literal value for %s - %s" % (line, (bnst, literal, bnend, endtag)))
+  
+  return minteduri
 
 def make_value_title(fh, lines, line, b_uri):
   bnst, literal, bnend, endtag = [fh.readline().rstrip() for x in xrange(4)]
       
       fh = codecs.open(fname, "r", "utf-8" )  # Assuming UTF-8
       fo = codecs.open(fname+".nquads", "w", "utf-8") # export to nquads
+      flean = codecs.open(fname+".lean.nquads", "w", "utf-8") # export to lean nquads (no repeated entity quads)
 
       # first two lines are the xml declaration and the <rdf:RDF ....> element
       xmldec = fh.readline()
       start_t = time()
       process_times = []
       lines = [head]
+      minteduris = []
       line = fh.readline().rstrip()  # remove EOL /r/n or /n
       b_uri = get_uri(file_no, record)
       bnb_id = None
           # read in and convert to nt
           g = Graph(identifier=ident)
           g.parse(data=rdf, format="application/rdf+xml")
-          for subj in g.distinct_subjects(RDF["type"]):
-               if not isinstance(subj, URIRef):
-                   continue
-               if subj == ident:
-                   sg = Graph()
-                   sg += g
-               else:
-                   sg = g.bnc((subj, None, None))
-               for ext in ("rdf", "nt", "ttl", "n3", "html"):
-                   doc = URIRef(subj + "." + ext)
-                   sg.add((subj, FOAF["isPrimaryTopicOf"], doc))
-                   sg.add((doc, RDF["type"], FOAF["Document"]))
-                   sg.add((doc, FOAF["primaryTopic"], subj))
+          for subj in set(g.distinct_subjects(RDF["type"])):
+            if not isinstance(subj, URIRef):
+              continue
+            elif subj == ident:
+              sg = Graph()
+              sg += g
+            else:
+              sg = g.bnc((subj, None, None))
+            for ext in ("rdf", "nt", "ttl", "n3", "html"):
+              doc = URIRef(subj + "." + ext)
+              sg.add((subj, FOAF["isPrimaryTopicOf"], doc))
+              sg.add((doc, RDF["type"], FOAF["Document"]))
+              sg.add((doc, FOAF["primaryTopic"], subj))
 
-               nt = sg.serialize(format="nt")
+            nt = sg.serialize(format="nt")
           
-               # NOTE: Should the graph URI be the nt document's one, some other format, or the abstract root URI? [Ben's question to self...]
-               nq = nt.replace(u".\n", u"%s .\n" % subj.n3())
+            # NOTE: Should the graph URI be the nt document's one, some other format, or the abstract root URI? [Ben's question to self...]
+            nq = nt.replace(u".\n", u"%s .\n" % subj.n3())
+            fo.write(nq)
+            if subj in minteduris + [ident]:
+              flean.write(nq)
+                   
 
           # output the result
           #for exportline in nq.split("\n"):
           #  if exportline:  # skip empty lines
           #    fo.write(exportline + u"\n")
-          fo.write(nq)
+          #fo.write(nq)
           lines = []
+          minteduris = []
           record = record + 1
           bnb_id = None
           uk_id = None
                       u"    <dcterms:publisher>"
                       ]:
           # turns a predicate -> bnode -> rdf:value into predicate -> minted hashed URI foaf:Agent -> name
-          promote_literal_agent(fh, lines, line, b_uri)
+          uri = promote_literal_agent(fh, lines, line, b_uri)
+          if uri:
+            minteduris.append(uri)
         elif line == u"    <dcterms:type>":
           handle_type(fh, lines, line)
         elif line in [u"    <dcterms:isPartOf>"]:
 #                      ]:
           # turns a predicate -> bnode -> rdf:value into predicate -> minted hashed URI bibo:Document -> title
           #promote_literal_work(fh, lines, line, b_uri) # these are all other Works of some kind, I believe
-          promote_series(fh, lines, line, b_uri)
+          uri = promote_series(fh, lines, line, b_uri)
+          if uri:
+            minteduris.append(uri)
         elif line in [u"    <dcterms:replaces>",
                       u"    <dcterms:isReplacedBy>"]:
           make_value_title(fh, lines, line, b_uri)