dataaccess
A module for providing access to data based on data documentation from the datasets module.
High-level functions for accessing and storing actual data:
load()
: Load documented dataset from its source.

save()
: Save documented dataset to a data resource.
Note
This module may eventually be moved out of tripper into a separate package.
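A minimal end-to-end sketch of the two functions, with purely illustrative IRIs and file paths (both functions currently rely on DLite's Protocol plugin system):

```python
from tripper import Triplestore
from tripper.dataset.dataaccess import load, save

ts = Triplestore(backend="rdflib")

# Save some bytes and document them as a dataset in the triplestore.
iri = save(
    ts,
    data=b"hello world",
    dataset="http://example.com/datasets#hello",
    distribution={"downloadURL": "file:///tmp/hello.txt"},
)

# Load the bytes back using only the documentation in the triplestore.
assert load(ts, iri) == b"hello world"
```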
load(ts, iri, distributions=None, use_sparql=None)
Load dataset with given IRI from its source.
Parameters:

Name | Type | Description | Default
---|---|---|---
`ts` | `Triplestore` | Triplestore documenting the data to load. | required
`iri` | `str` | IRI of the data to load. | required
`distributions` | `Optional[Union[str, Sequence[str]]]` | Name or sequence of names of distribution(s) to try in case the dataset has multiple distributions. The default is to try all documented distributions. | `None`
`use_sparql` | `Optional[bool]` | Whether to access the triplestore with SPARQL. Defaults to `ts.prefer_sparql`. | `None`
Returns:

Type | Description
---|---
`bytes` | Bytes object with the underlying data.
Note
For now this requires DLite.
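A minimal usage sketch, where the Turtle file and dataset IRI are purely illustrative (DLite must be installed):

```python
from tripper import Triplestore
from tripper.dataset.dataaccess import load

# Hypothetical triplestore that already documents the dataset and its
# distribution(s), e.g. parsed from a Turtle file.
ts = Triplestore(backend="rdflib")
ts.parse("documentation.ttl")  # hypothetical documentation file

# Fetch the raw bytes of the dataset from its documented source.
data = load(ts, iri="http://example.com/datasets#mydata")
```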
Source code in tripper/dataset/dataaccess.py
def load(
ts: Triplestore,
iri: str,
distributions: "Optional[Union[str, Sequence[str]]]" = None,
use_sparql: "Optional[bool]" = None,
) -> bytes:
"""Load dataset with given IRI from its source.
Arguments:
ts: Triplestore documenting the data to load.
iri: IRI of the data to load.
distributions: Name or sequence of names of distribution(s) to
try in case the dataset has multiple distributions. The
default is to try all documented distributions.
use_sparql: Whether to access the triplestore with SPARQL.
Defaults to `ts.prefer_sparql`.
Returns:
Bytes object with the underlying data.
Note:
For now this requires DLite.
"""
# pylint: disable=import-outside-toplevel
# Use the Protocol plugin system from DLite. Should we move it to tripper?
import dlite
from dlite.protocol import Protocol
dct = load_dict(ts, iri=iri, use_sparql=use_sparql)
if DCAT.Dataset not in get(dct, "@type"):
raise TypeError(
f"expected IRI '{iri}' to be a dataset, but got: "
f"{', '.join(get(dct, '@type'))}"
)
if distributions is None:
distributions = get(dct, "distribution")
for dist in distributions:
url = dist.get("downloadURL", dist.get("accessURL")) # type: ignore
if url:
p = urlparse(url)
# Mapping of supported schemes - should be moved into the protocol
# module.
schemes = {
"https": "http",
}
scheme = schemes.get(p.scheme, p.scheme) if p.scheme else "file"
location = (
f"{scheme}://{p.netloc}{p.path}"
if p.netloc
else f"{scheme}:{p.path}"
)
id = (
dist.accessService.get("identifier") # type: ignore
if "accessService" in dist
else None
)
try:
with Protocol(scheme, location, options=p.query) as pr:
return pr.load(id)
# pylint: disable=no-member
except (dlite.DLiteProtocolError, dlite.DLiteIOError):
pass
raise IOError(f"Cannot access dataset: {iri}")
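To illustrate how the loop above turns a documented URL into a protocol scheme and location, here is a standalone sketch with a hypothetical `downloadURL` (note that `https` is mapped to the `http` protocol plugin):

```python
from urllib.parse import urlparse

p = urlparse("https://example.com/data/mydata.csv?mode=r")
schemes = {"https": "http"}
scheme = schemes.get(p.scheme, p.scheme) if p.scheme else "file"
location = (
    f"{scheme}://{p.netloc}{p.path}" if p.netloc else f"{scheme}:{p.path}"
)
print(scheme)    # http
print(location)  # http://example.com/data/mydata.csv
print(p.query)   # mode=r  (passed as options to the Protocol plugin)
```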
save(ts, data, class_iri=None, dataset=None, distribution=None, generator=None, prefixes=None, use_sparql=None)
Save data to a data resource and document it in the triplestore.
Parameters:

Name | Type | Description | Default
---|---|---|---
`ts` | `Triplestore` | Triplestore that documents the data to save. | required
`data` | `bytes` | Bytes representation of the data to save. | required
`class_iri` | `Optional[str]` | IRI of a class in the ontology (e.g. an `emmo:DataSet` subclass) that describes the dataset that is saved. Used to select the `distribution` if that is not given. If `distribution` is also given, a `dcat:distribution value <distribution>` restriction will be added to `class_iri`. | `None`
`dataset` | `Optional[Union[str, dict]]` | Either the IRI of the dataset individual standing for the data to be saved, or a dict that in addition to the IRI (`'@id'` keyword) can provide additional documentation of the dataset. If `dataset` is None, a new blank node IRI will be created. | `None`
`distribution` | `Optional[Union[str, dict]]` | Either the IRI of the distribution for the data to be saved, or a dict with additional documentation of the distribution, like media type, parsers, generators etc. If `distribution` is None and `dataset` is not a dict with a `'distribution'` keyword, a new distribution will be added to the dataset. | `None`
`generator` | `Optional[str]` | Name of the generator to use in case the distribution has several generators. | `None`
`prefixes` | `Optional[dict]` | Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. | `None`
`use_sparql` | `Optional[bool]` | Whether to access the triplestore with SPARQL. Defaults to `ts.prefer_sparql`. | `None`
Returns:

Type | Description
---|---
`str` | IRI of the dataset.
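A minimal usage sketch; the IRIs, file path and the `title` keyword are illustrative assumptions, not values prescribed by the API:

```python
from tripper import Triplestore
from tripper.dataset.dataaccess import save

ts = Triplestore(backend="rdflib")

# Document and save CSV bytes. The dataset dict adds extra
# documentation ("title" is assumed to be a JSON-LD context keyword).
iri = save(
    ts,
    data=b"a,b\n1,2\n",
    dataset={
        "@id": "http://example.com/datasets#mydata",
        "title": "My tabular data",
    },
    distribution={
        "downloadURL": "file:///tmp/mydata.csv",
        "mediaType": "text/csv",
    },
)
print(iri)  # -> http://example.com/datasets#mydata
```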
Source code in tripper/dataset/dataaccess.py
def save(
ts: Triplestore,
data: bytes,
class_iri: "Optional[str]" = None,
dataset: "Optional[Union[str, dict]]" = None,
distribution: "Optional[Union[str, dict]]" = None,
generator: "Optional[str]" = None,
prefixes: "Optional[dict]" = None,
use_sparql: "Optional[bool]" = None,
) -> str:
"""Saves data to a dataresource and document it in the triplestore.
Arguments:
ts: Triplestore that documents the data to save.
data: Bytes representation of the data to save.
        class_iri: IRI of a class in the ontology (e.g. an `emmo:DataSet`
            subclass) that describes the dataset that is saved.
            Is used to select the `distribution` if that is not given.
            If `distribution` is also given, a
            `dcat:distribution value <distribution>` restriction will be
            added to `class_iri`.
        dataset: Either the IRI of the dataset individual standing for
            the data to be saved, or a dict that in addition to the IRI
            ('@id' keyword) can provide additional documentation of
            the dataset.
            If `dataset` is None, a new blank node IRI will be created.
        distribution: Either the IRI of the distribution for the data
            to be saved, or a dict with additional documentation of the
            distribution, like media type, parsers, generators etc.
            If `distribution` is None and `dataset` is not a dict with a
            'distribution' keyword, a new distribution will be added
            to the dataset.
        generator: Name of the generator to use in case the distribution
            has several generators.
prefixes: Dict with prefixes in addition to those included in the
JSON-LD context. Should map namespace prefixes to IRIs.
use_sparql: Whether to access the triplestore with SPARQL.
Defaults to `ts.prefer_sparql`.
Returns:
IRI of the dataset.
"""
# pylint: disable=too-many-locals,too-many-branches,too-many-statements
# pylint: disable=import-outside-toplevel
# Use the Protocol plugin system from DLite. Should we move it to tripper?
from dlite.protocol import Protocol
triples = []
save_dataset = save_distribution = False
if dataset is None:
# __TODO__: Infer dataset from value restriction on `class_iri`
# This require that we make a SPARQL-version of ts.restriction().
newiri = f"_:N{secrets.token_hex(16)}"
typeiri = [DCAT.Dataset, class_iri] if class_iri else DCAT.Dataset
dataset = AttrDict({"@id": newiri, "@type": typeiri})
save_dataset = True
elif isinstance(dataset, str):
dset = load_dict(ts, iri=dataset, use_sparql=use_sparql)
if dset:
dataset = dset
else:
typeiri = [DCAT.Dataset, class_iri] if class_iri else DCAT.Dataset
dataset = AttrDict({"@id": dataset, "@type": typeiri})
save_dataset = True
elif isinstance(dataset, dict):
save_dataset = True
else:
raise TypeError(
"if given, `dataset` should be either a string or dict"
)
dataset: dict # Tell mypy that this now is a dict
if distribution is None:
if "distribution" in dataset:
distribution = get(dataset, "distribution")[0]
else:
newiri = f"_:N{secrets.token_hex(16)}"
distribution = AttrDict(
{"@id": newiri, "@type": DCAT.Distribution}
)
add(dataset, "distribution", distribution)
triples.append((dataset["@id"], DCAT.distribution, newiri))
save_distribution = True
if isinstance(distribution, str):
distr = load_dict(ts, iri=distribution, use_sparql=use_sparql)
if distr:
distribution = distr
else:
distribution = AttrDict(
{"@id": distribution, "@type": DCAT.Distribution}
)
add(dataset, "distribution", distribution)
            # Link the dataset to the newly documented distribution
            # (not to `newiri`, which is unset on this branch).
            triples.append(
                (dataset["@id"], DCAT.distribution, distribution["@id"])
            )
save_distribution = True
elif isinstance(distribution, dict):
        add(dataset, "distribution", distribution)
if "@id" in distribution:
triples.append(
(dataset["@id"], DCAT.distribution, distribution["@id"])
)
save_distribution = True
else:
raise TypeError(
"if given, `distribution` should be either a string or dict"
)
distribution: dict # Tell mypy that this now is a dict
if isinstance(generator, str):
for gen in get(distribution, "generator"):
if gen.get("@id") == generator:
break
else:
raise ValueError(
f"dataset '{dataset}' has no such generator: {generator}"
)
elif "generator" in distribution:
gen = get(distribution, "generator")[0]
else:
gen = None
# __TODO__: Check if `class_iri` already has the value restriction.
# If not, add it to triples
# __TODO__: Move this mapping of supported schemes into the protocol
# module.
schemes = {
"https": "http",
}
# Save data
url = distribution.get("downloadURL", distribution.get("accessURL"))
p = urlparse(url)
scheme = schemes.get(p.scheme, p.scheme) if p.scheme else "file"
location = (
f"{scheme}://{p.netloc}{p.path}" if p.netloc else f"{scheme}:{p.path}"
)
options = [p.query] if p.query else []
if gen and "configuration" in gen and "options" in gen.configuration:
# __TODO__: allow options to also be a dict
options.append(gen.configuration["options"])
id = (
distribution["accessService"].get("identifier")
if "accessService" in distribution
else None
)
with Protocol(scheme, location, options=";".join(options)) as pr:
pr.save(data, id)
# Update triplestore
ts.add_triples(triples)
if save_dataset:
save_dict(ts, dataset, "dataset", prefixes=prefixes)
elif save_distribution:
save_dict(ts, distribution, "distribution", prefixes=prefixes)
return dataset["@id"]
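The generator lookup in `save()` uses Python's `for ... else` construct: the `else` branch runs only when the loop finishes without a `break`, i.e. when no documented generator matched the requested name. A standalone sketch with hypothetical generator names:

```python
generators = [{"@id": "csv-writer"}, {"@id": "json-writer"}]
wanted = "json-writer"

for gen in generators:
    if gen.get("@id") == wanted:
        break  # `gen` keeps the matching generator after the loop
else:
    # Only reached if no generator matched.
    raise ValueError(f"no such generator: {wanted}")

print(gen)  # {'@id': 'json-writer'}
```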