Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
context="sys/context/base.jsonld",
system_base_iri="",
union="common.jsonld.lines",
last_backwards_id_time="2022-10-14T16:26:16Z"
)

if __name__ == "__main__":
Expand Down
60 changes: 45 additions & 15 deletions lxltools/datacompiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ def __init__(self, *,
context=None,
record_thing_link='mainEntity',
system_base_iri=None,
union='all.jsonld.lines'):
union='all.jsonld.lines',
last_backwards_id_time=None):
self.datasets_description = datasets_description
self.datasets = {}
self.current_ds_resources = set()
Expand All @@ -49,6 +50,11 @@ def __init__(self, *,
self.current_ds_file = None
self.no_records = False

self.last_backwards_id_time = (
timeutil.w3c_dtz_to_ms(last_backwards_id_time)
if isinstance(last_backwards_id_time, str)
else None)

if datasets_description:
self._handlers_from_datasets_description(datasets_description)

Expand Down Expand Up @@ -155,7 +161,8 @@ def _compile_dataset(self, name, result):
data = self.to_jsonld(data)

ds_url = urljoin(self.dataset_id, name)
self._create_dataset_description(ds_url, ds_created_ms, ds_modified_ms)
self._create_dataset_description(
ds_url, ds_created_ms, ds_created_ms=ds_created_ms)

base_id = urljoin(self.dataset_id, base)

Expand All @@ -172,10 +179,6 @@ def _compile_dataset(self, name, result):
modified_ms = None
fpath = urlparse(nodeid).path[1:]

if self.no_records:
self.write(node, fpath)
continue

meta = node.pop('meta', None)
if meta:
if 'created' in meta:
Expand All @@ -189,10 +192,25 @@ def _compile_dataset(self, name, result):
node,
created_ms,
modified_ms,
datasets=[self.dataset_id, ds_url])
self.write(desc, fpath)
datasets=[self.dataset_id, ds_url],
ds_created_ms=ds_created_ms)

# Keep sameAs "forwards" form in meta even if no_records is used
Comment thread
olovy marked this conversation as resolved.
Outdated
if self.no_records:
meta = meta or {}
sameas = meta.setdefault('sameAs', [])
rec = desc['@graph'][0]
if 'sameAs' in rec:
sameas.append({"@id": rec['@id']})
for same in rec.get('sameAs', []):
sameas.append(same)
node['meta'] = meta
self.write(node, fpath)
else:
self.write(desc, fpath)

def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, label=None):
def _create_dataset_description(self, ds_url, created_ms, modified_ms=None,
label=None, ds_created_ms=None):
if not label:
label = ds_url.rsplit('/', 1)[-1]
ds = {
Expand All @@ -211,7 +229,7 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
return

desc = self._to_node_description(ds, created_ms, modified_ms,
datasets={self.dataset_id, ds_url})
datasets={self.dataset_id, ds_url}, ds_created_ms=ds_created_ms)

record = desc['@graph'][0]
if self.tool_id:
Expand All @@ -220,14 +238,16 @@ def _create_dataset_description(self, ds_url, created_ms, modified_ms=None, labe
self.write(desc, ds_path)

def _to_node_description(self, node, created_ms,
modified_ms=None, datasets=None):
modified_ms=None, datasets=None, ds_created_ms=None):
assert self.record_thing_link not in node

node_id = node['@id']

record = OrderedDict()
record['@type'] = 'Record'
record['@id'] = self.generate_record_id(created_ms, node_id)

self.set_record_id(record, created_ms, node_id, ds_created_ms)

record[self.record_thing_link] = {'@id': node_id}

# Add provenance
Expand All @@ -241,9 +261,19 @@ def _to_node_description(self, node, created_ms,

return {'@graph': items}

def generate_record_id(self, created_ms, node_id):
# FIXME: backwards_form=created_ms < 2015
slug = lxlslug.librisencode(created_ms, lxlslug.checksum(node_id))
def set_record_id(self, record, created_ms, node_id, ds_created_ms=None):
    """Assign ``record['@id']`` and, for backwards-form IDs, a ``sameAs`` alias.

    The ID is produced by ``self.generate_record_id``. The backwards form
    is chosen when the dataset creation time is earlier than
    ``self.last_backwards_id_time``; in that case the normal-form ID is
    also stored as the single entry of ``record['sameAs']``.

    :param record: mapping that is updated in place.
    :param created_ms: creation time handed on to ``generate_record_id``.
    :param node_id: identifier of the node the record describes.
    :param ds_created_ms: dataset creation time compared against the
        cutoff; falls back to ``created_ms`` when ``None``.
    """
    if ds_created_ms is None:
        ds_created_ms = created_ms

    cutoff = self.last_backwards_id_time
    # The cutoff is None when none was configured; ordering an int against
    # None raises TypeError, so "no cutoff" means "never backwards form".
    # NOTE(review): confirm that normal form is the wanted default here.
    backwards_form = cutoff is not None and ds_created_ms < cutoff

    # TODO: use normal form and keep backwards_form as sameAs until "GC:able"?
    record['@id'] = self.generate_record_id(created_ms, node_id, backwards_form)
    if backwards_form:
        record['sameAs'] = [{'@id': self.generate_record_id(created_ms, node_id)}]

def generate_record_id(self, created_ms, node_id, backwards_form=False):
    """Return a record IRI for ``node_id`` resolved against the system base.

    A slug is computed by ``lxlslug.librisencode`` from ``created_ms`` and
    the ``lxlslug.checksum`` of ``node_id``; ``backwards_form`` is passed
    through to the encoder unchanged. The slug is then joined onto
    ``self.system_base_iri`` with ``urljoin``.
    """
    node_checksum = lxlslug.checksum(node_id)
    slug = lxlslug.librisencode(
        created_ms, node_checksum, backwards_form=backwards_form)
    return urljoin(self.system_base_iri, slug)

def write(self, node, name):
Expand Down
12 changes: 7 additions & 5 deletions lxltools/lxlslug.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#!/usr/bin/env python
from __future__ import unicode_literals, print_function

from typing import Any
from zlib import crc32
import string
import time
Expand Down Expand Up @@ -32,9 +31,12 @@ def rotate(c):
def checksum(data):
    """Return the CRC-32 of ``data`` (UTF-8 encoded) as an unsigned 32-bit int."""
    encoded = data.encode('utf-8')
    # Mask to 32 bits so the result is non-negative regardless of platform.
    return crc32(encoded) & 0xFFFFFFFF

def librisencode(a, b):
def librisencode(a, b, backwards_form=False):
alphabet = lower_consonants_numbers
timepart = "".join(reversed(caesarize(alphabet, tobase(alphabet, a))))
chars = caesarize(alphabet, tobase(alphabet, a))
if backwards_form:
chars = reversed(chars)
timepart = "".join(chars)
codepart = tobase(alphabet, b)
codelen = len(codepart)
if codelen < 7:
Expand All @@ -53,7 +55,7 @@ def librisencode(a, b):
print("Usage: %s TIMESTAMP IDENTIFIER" % (cmd), file=sys.stderr)
exit(1)

timestamp = args.pop(0)
timestamp: Any = args.pop(0)
identifiers = args

try:
Expand Down
5 changes: 3 additions & 2 deletions syscore.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ def _get_repo_version():
context='sys/context/base.jsonld',
record_thing_link='mainEntity',
system_base_iri='',
union='syscore.jsonld.lines')
union='syscore.jsonld.lines',
last_backwards_id_time='2022-11-20T00:00:00Z')


@compiler.handler
Expand Down Expand Up @@ -145,7 +146,7 @@ def _insert_record(graph, created_ms, dataset_id):
record = {'@type': 'SystemRecord'}
record[compiler.record_thing_link] = {'@id': entity['@id']}
graph.insert(0, record)
record['@id'] = compiler.generate_record_id(created_ms, entity['@id'])
compiler.set_record_id(record, created_ms, entity['@id'])
record['inDataset'] = [{'@id': compiler.dataset_id}, {'@id': dataset_id}]


Expand Down