dataset¶
Module for documenting datasets with Tripper.
The dataset documentation follows the DCAT structure and is exposed in this module as Python dicts with attribute access. The semantic meaning of the keywords in these dicts is defined by a JSON-LD context.
High-level function for populating the triplestore from YAML documentation:
save_datadoc()
: Save documentation from YAML file to the triplestore.
Functions for searching the triplestore:
search_iris()
: Get IRIs of matching entries in the triplestore.
Functions for working with the dict-representation:
read_datadoc()
: Read documentation from YAML file and return it as a dict.
save_dict()
: Save dict documentation to the triplestore.
load_dict()
: Load dict documentation from the triplestore.
as_jsonld()
: Return the dict as JSON-LD (represented as a Python dict).
Functions for interaction with OTEAPI:
get_partial_pipeline()
: Return an OTELib partial pipeline.
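A minimal sketch of the typical workflow, assuming the functions are importable from `tripper.dataset` and that `datadoc.yaml` is a hypothetical YAML file with dataset documentation:

    from tripper import Triplestore
    from tripper.dataset import load_dict, save_datadoc, search_iris

    ts = Triplestore(backend="rdflib")  # temporary in-memory triplestore
    save_datadoc(ts, "datadoc.yaml")    # populate it from YAML documentation
    for iri in search_iris(ts):         # IRIs of all documented datasets
        print(load_dict(ts, iri))       # dict-representation of each dataset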
add(d, key, value)
¶
Append key-value pair to dict `d`.
If `key` already exists in `d`, its value is converted to a list and `value` is appended to it. `value` may also be a list. Values are not duplicated.
Source code in tripper/dataset/dataset.py
def add(d: dict, key: str, value: "Any") -> None:
"""Append key-value pair to dict `d`.
If `key` already exists in `d`, its value is converted to a list
and `value` is appended to it. `value` may also be a list. Values
are not duplicated.
"""
if key not in d:
d[key] = value
else:
klst = d[key] if isinstance(d[key], list) else [d[key]]
if isinstance(value, dict):
v = klst if value in klst else klst + [value]
else:
vlst = value if isinstance(value, list) else [value]
try:
v = list(set(klst).union(vlst))
except TypeError: # klst contains unhashable dicts
v = klst + [x for x in vlst if x not in klst]
d[key] = (
v[0]
if len(v) == 1
else sorted(
# Sort dicts at end, by representing them with a huge
# unicode character
v,
key=lambda x: "\uffff" if isinstance(x, dict) else x,
)
)
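A short sketch of the merging behaviour described above:

    d = {}
    add(d, "creator", "Alice")
    add(d, "creator", "Bob")    # existing value is converted to a list
    add(d, "creator", "Alice")  # duplicates are not added again
    # d == {"creator": ["Alice", "Bob"]}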
addnested(d, key, value)
¶
Like `add()`, but allows `key` to be a dot-separated list of sub-keys. Returns the updated `d`.
Each sub-key will be added to `d` as a corresponding sub-dict.
Examples:
>>> d = {}
>>> addnested(d, "a.b.c", "val") == {'a': {'b': {'c': 'val'}}}
True
Source code in tripper/dataset/dataset.py
def addnested(
d: "Union[dict, list]", key: str, value: "Any"
) -> "Union[dict, list]":
"""Like add(), but allows `key` to be a dot-separated list of sub-keys.
Returns the updated `d`.
Each sub-key will be added to `d` as a corresponding sub-dict.
Example:
>>> d = {}
>>> addnested(d, "a.b.c", "val") == {'a': {'b': {'c': 'val'}}}
True
"""
if "." in key:
first, rest = key.split(".", 1)
if isinstance(d, list):
for ele in d:
if isinstance(ele, dict):
addnested(ele, key, value)
break
else:
d.append(addnested({}, key, value))
elif first in d and isinstance(d[first], (dict, list)):
addnested(d[first], rest, value)
else:
addnested(d, first, addnested(AttrDict(), rest, value))
elif isinstance(d, list):
for ele in d:
if isinstance(ele, dict):
add(ele, key, value)
break
else:
d.append({key: value})
else:
add(d, key, value)
return d
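A sketch of how sub-keys are merged into an existing nested dict:

    d = {"a": {"b": {"c": "val"}}}
    addnested(d, "a.b.d", "other")
    # d == {"a": {"b": {"c": "val", "d": "other"}}}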
as_jsonld(dct, type='dataset', prefixes=None, **kwargs)
¶
Return an updated copy of dict `dct` as valid JSON-LD.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
dct | dict | Dict documenting a resource to be represented as JSON-LD. | required |
type | Optional[str] | Type of data to document. Should either be one of the pre-defined names: "dataset", "distribution", "accessService", "parser" and "generator" or an IRI to a class in an ontology. Defaults to "dataset". | 'dataset' |
prefixes | Optional[dict] | Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. | None |
kwargs | | Additional keyword arguments to add to the returned dict. A leading underscore in a key will be translated to a leading "@"-sign. For example, "@id", "@type" or "@context" may be provided as "_id", "_type" or "_context", respectively. | {} |
Returns:
Type | Description |
---|---|
dict | An updated copy of `dct` as valid JSON-LD. |
Source code in tripper/dataset/dataset.py
def as_jsonld(
dct: dict,
type: "Optional[str]" = "dataset",
prefixes: "Optional[dict]" = None,
**kwargs,
) -> dict:
"""Return an updated copy of dict `dct` as valid JSON-LD.
Arguments:
dct: Dict documenting a resource to be represented as JSON-LD.
type: Type of data to document. Should either be one of the
pre-defined names: "dataset", "distribution", "accessService",
"parser" and "generator" or an IRI to a class in an ontology.
Defaults to "dataset".
prefixes: Dict with prefixes in addition to those included in the
JSON-LD context. Should map namespace prefixes to IRIs.
kwargs: Additional keyword arguments to add to the returned
dict. A leading underscore in a key will be translated to
a leading "@"-sign. For example, "@id", "@type" or
"@context" may be provided as "_id" "_type" or "_context",
respectively.
Returns:
An updated copy of `dct` as valid JSON-LD.
"""
# pylint: disable=too-many-branches
# Id of base entry that is documented
_entryid = kwargs.pop("_entryid", None)
context = kwargs.pop("_context", None)
d = AttrDict()
if not _entryid:
d["@context"] = CONTEXT_URL
if context:
add(d, "@context", context)
if type:
t = dicttypes[type]["@type"] if type in dicttypes else type
add(d, "@type", t) # get type at top
d.update(dct)
add(d, "@type", t) # readd type if overwritten
else:
d.update(dct)
for k, v in kwargs.items():
key = f"@{k[1:]}" if re.match("^_([^_]|([^_].*[^_]))$", k) else k
add(d, key, v)
if "@id" not in d and not _entryid:
raise ValueError("Missing '@id' in dict to document")
if not _entryid:
_entryid = d["@id"]
if "@type" not in d:
warnings.warn(f"Missing '@type' in dict to document: {_entryid}")
all_prefixes = get_prefixes(context=context)
if prefixes:
all_prefixes.update(prefixes)
# Recursively expand IRIs and prepare sub-directories
# Nested lists are not supported
nested = dicttypes.keys()
for k, v in d.items():
if k == "mappingURL":
for url in get(d, k):
with Triplestore("rdflib") as ts2:
ts2.parse(url, format=d.get("mappingFormat"))
if "statements" in d:
d.statements.extend(ts2.triples())
else:
d["statements"] = list(ts2.triples())
if k in ("statements", "mappings"):
for i, spo in enumerate(d[k]):
d[k][i] = [
(
get(d, e, e)[0]
if e.startswith("@")
else expand_iri(e, prefixes=all_prefixes)
)
for e in spo
]
elif isinstance(v, str):
d[k] = expand_iri(v, all_prefixes)
elif isinstance(v, list):
for i, e in enumerate(v):
if isinstance(e, str):
v[i] = expand_iri(e, all_prefixes)
elif isinstance(e, dict) and k in nested:
v[i] = as_jsonld(
e, k, _entryid=_entryid, prefixes=prefixes
)
elif isinstance(v, dict) and k in nested:
d[k] = as_jsonld(v, k, _entryid=_entryid, prefixes=prefixes)
return d
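A minimal sketch, assuming that the keyword `title` is defined in the default JSON-LD context and that `ex` is a user-supplied prefix:

    d = as_jsonld(
        {"@id": "ex:mydata", "title": "My dataset"},
        type="dataset",
        prefixes={"ex": "http://example.com/"},
    )
    # d["@context"] is the default context URL, d["@type"] holds the type(s)
    # registered for "dataset" and d["@id"] is expanded to
    # "http://example.com/mydata"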
expand_iri(iri, prefixes)
¶
Return the full IRI if `iri` is prefixed. Otherwise `iri` is returned.
Source code in tripper/dataset/dataset.py
def expand_iri(iri: str, prefixes: dict) -> str:
"""Return the full IRI if `iri` is prefixed. Otherwise `iri` is
returned."""
match = re.match(MATCH_PREFIXED_IRI, iri)
if match:
prefix, name, _ = match.groups()
if prefix in prefixes:
return f"{prefixes[prefix]}{name}"
warnings.warn(f'Undefined prefix "{prefix}" in IRI: {iri}')
return iri
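For example:

    prefixes = {"dcat": "http://www.w3.org/ns/dcat#"}
    expand_iri("dcat:Dataset", prefixes)  # -> "http://www.w3.org/ns/dcat#Dataset"
    expand_iri("Dataset", prefixes)       # not prefixed -> returned unchanged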
get(d, key, default=None, aslist=True)
¶
Like `d.get(key, default)` but returns the value as a list if `aslist` is True and value is not already a list.
An empty list is returned in the special case that `key` is not in `d` and `default` is None.
Source code in tripper/dataset/dataset.py
def get(
d: dict, key: str, default: "Any" = None, aslist: bool = True
) -> "Any":
"""Like `d.get(key, default)` but returns the value as a list if
`aslist` is True and value is not already a list.
An empty list is returned in the special case that `key` is not in
`d` and `default` is None.
"""
value = d.get(key, default)
if aslist:
return (
value
if isinstance(value, list)
else [] if value is None else [value]
)
return value
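For example:

    d = {"a": 1, "b": [1, 2]}
    get(d, "a")                # -> [1]
    get(d, "b")                # -> [1, 2]
    get(d, "c")                # -> []  (key missing and default is None)
    get(d, "a", aslist=False)  # -> 1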
get_jsonld_context(context=None, timeout=5, fromfile=True)
¶
Returns the JSON-LD context as a dict.
The JSON-LD context maps all the keywords that can be used as keys in the dict-representation of a dataset to properties defined in common vocabularies and ontologies.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
context | Optional[Union[str, dict, Sequence[Union[str, dict]]]] | Additional user-defined context that should be returned on top of the default context. It may be a string with a URL to the user-defined context, a dict with the user-defined context or a sequence of strings and dicts. | None |
timeout | float | Number of seconds before timing out. | 5 |
fromfile | bool | Whether to load the context from a local file. | True |
Source code in tripper/dataset/dataset.py
def get_jsonld_context(
context: "Optional[Union[str, dict, Sequence[Union[str, dict]]]]" = None,
timeout: float = 5,
fromfile: bool = True,
) -> dict:
"""Returns the JSON-LD context as a dict.
The JSON-LD context maps all the keywords that can be used as keys
in the dict-representation of a dataset to properties defined in
common vocabularies and ontologies.
Arguments:
context: Additional user-defined context that should be returned
on top of the default context. It may be a string with an URL
to the user-defined context, a dict with the user-defined context
or a sequence of strings and dicts.
timeout: Number of seconds before timing out.
fromfile: Whether to load the context from local file.
"""
import requests
if fromfile:
with open(CONTEXT_PATH[7:], "r", encoding="utf-8") as f:
ctx = json.load(f)["@context"]
else:
r = requests.get(CONTEXT_URL, allow_redirects=True, timeout=timeout)
ctx = json.loads(r.content)["@context"]
if isinstance(context, (str, dict)):
context = [context]
if context:
for token in context:
if isinstance(token, str):
r = requests.get(token, allow_redirects=True, timeout=timeout)
ctx.update(json.loads(r.content)["@context"])
elif isinstance(token, dict):
ctx.update(token)
else:
raise TypeError(
"`context` must be a string (URL), dict or a sequence of "
f"strings and dicts. Not '{type(token)}'"
)
return ctx
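A minimal sketch (the URL of the user-defined context is hypothetical):

    ctx = get_jsonld_context()  # default context, read from the bundled file
    ctx = get_jsonld_context(
        context="https://example.com/my-context.json",  # merged on top of the default
        fromfile=False,  # fetch the default context from CONTEXT_URL instead
    )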
get_partial_pipeline(ts, client, iri, parser=None, generator=None, distribution=None, use_sparql=None)
¶
Return an OTELib partial pipeline.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ts | Triplestore | Triplestore to load data from. | required |
client | | OTELib client to create the pipeline with. | required |
iri | str | IRI of the dataset to load. | required |
parser | Optional[Union[bool, str]] | Whether to return a datasource partial pipeline. Should be True or the IRI of the parser to use in case the distribution has multiple parsers. By default the first parser will be selected. | None |
generator | Optional[Union[bool, str]] | Whether to return a datasink partial pipeline. Should be True or the IRI of the generator to use in case the distribution has multiple generators. By default the first generator will be selected. | None |
distribution | Optional[str] | IRI of the distribution to use in case the dataset has multiple distributions. By default any of the distributions will be picked. | None |
use_sparql | Optional[bool] | Whether to access the triplestore with SPARQL. Defaults to `ts.prefer_sparql`. | None |
Returns:
Type | Description |
---|---|
bytes | OTELib partial pipeline. |
Source code in tripper/dataset/dataset.py
def get_partial_pipeline(
ts: Triplestore,
client,
iri: str,
parser: "Optional[Union[bool, str]]" = None,
generator: "Optional[Union[bool, str]]" = None,
distribution: "Optional[str]" = None,
use_sparql: "Optional[bool]" = None,
) -> bytes:
"""Returns a OTELib partial pipeline.
Arguments:
ts: Triplestore to load data from.
client: OTELib client to create pipeline with.
iri: IRI of the dataset to load.
parser: Whether to return a datasource partial pipeline.
Should be True or an IRI of parser to use in case the
distribution has multiple parsers. By default the first
parser will be selected.
generator: Whether to return a datasink partial pipeline.
Should be True or an IRI of generator to use in case the
distribution has multiple generators. By default the first
generator will be selected.
distribution: IRI of distribution to use in case the dataset
dataset has multiple distributions. By default any of
the distributions will be picked.
use_sparql: Whether to access the triplestore with SPARQL.
Defaults to `ts.prefer_sparql`.
Returns:
OTELib partial pipeline.
"""
# pylint: disable=too-many-branches
dct = load_dict(ts, iri, use_sparql=use_sparql)
if isinstance(distribution, str):
for distr in get(dct, "distribution"):
if distr["@id"] == distribution:
break
else:
raise ValueError(
f"dataset '{iri}' has no such distribution: {distribution}"
)
else:
distr = get(dct, "distribution")[0]
accessService = (
distr.accessService.get("endpointURL")
if "accessService" in distr
else None
)
# OTEAPI still puts the parse configurations into the dataresource
# instead of a in a separate parse strategy...
if parser:
if parser is True:
par = get(distr, "parser")[0]
elif isinstance(parser, str):
for par in get(distr, "parser"):
if par.get("@id") == parser:
break
else:
raise ValueError(
f"dataset '{iri}' has no such parser: {parser}"
)
configuration = par.get("configuration")
else:
configuration = None
dataresource = client.create_dataresource(
downloadUrl=distr.get("downloadURL"),
mediaType=distr.get("mediaType"),
accessUrl=distr.get("accessURL"),
accessService=accessService,
configuration=dict(configuration) if configuration else {},
)
statements = dct.get("statements", [])
statements.extend(dct.get("mappings", []))
if statements:
mapping = client.create_mapping(
mappingType="triples",
# The OTEAPI datamodels stupidly strict, requireing us
# to cast the data ts.namespaces and statements
prefixes={k: str(v) for k, v in ts.namespaces.items()},
triples=[tuple(t) for t in statements],
)
if parser:
pipeline = dataresource
if statements:
pipeline = pipeline >> mapping
elif generator:
if generator is True:
gen = get(distr, "generator")[0]
elif isinstance(generator, str):
for gen in get(distr, "generator"):
if gen.get("@id") == generator:
break
else:
raise ValueError(
f"dataset '{iri}' has no such generator: {generator}"
)
conf = gen.get("configuration")
if gen.generatorType == "application/vnd.dlite-generate":
conf.setdefault("datamodel", dct.get("datamodel"))
function = client.create_function(
functionType=gen.generatorType,
configuration=conf,
)
if statements:
pipeline = mapping >> function >> dataresource
else:
pipeline = function >> dataresource
return pipeline
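A hedged sketch of how the returned partial pipelines may be combined with OTELib. It assumes the otelib package with its `OTEClient`, a triplestore `ts` populated with data documentation, and hypothetical IRIs of an already documented source and sink dataset:

    from otelib import OTEClient

    client = OTEClient("python")
    # Partial pipeline that parses the source dataset
    source = get_partial_pipeline(ts, client, source_iri, parser=True)
    # Partial pipeline that generates (writes) the sink dataset
    sink = get_partial_pipeline(ts, client, sink_iri, generator=True)
    # Combine the partial pipelines and execute the result
    pipeline = source >> sink
    pipeline.get()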
get_prefixes(context=None, timeout=5, fromfile=True)
¶
Loads the JSON-LD context and returns a dict mapping prefixes to their namespace URL.
Arguments are passed to `get_jsonld_context()`.
Source code in tripper/dataset/dataset.py
def get_prefixes(
context: "Optional[Union[str, dict, Sequence[Union[str, dict]]]]" = None,
timeout: float = 5,
fromfile: bool = True,
) -> dict:
"""Loads the JSON-LD context and returns a dict mapping prefixes to
their namespace URL.
Arguments are passed to `get_jsonld_context()`.
"""
ctx = get_jsonld_context(
context=context, timeout=timeout, fromfile=fromfile
)
prefixes = {
k: str(v)
for k, v in ctx.items()
if isinstance(v, (str, Namespace)) and str(v).endswith(("#", "/"))
}
return prefixes
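For example, assuming the `dcat` prefix is defined in the default context:

    prefixes = get_prefixes()
    prefixes["dcat"]  # -> "http://www.w3.org/ns/dcat#"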
get_shortnames(context=None, timeout=5, fromfile=True)
¶
Loads the JSON-LD context and returns a dict mapping IRIs to their short names defined in the context.
Arguments are passed to `get_jsonld_context()`.
Source code in tripper/dataset/dataset.py
def get_shortnames(
context: "Optional[Union[str, dict, Sequence[Union[str, dict]]]]" = None,
timeout: float = 5,
fromfile: bool = True,
) -> dict:
"""Loads the JSON-LD context and returns a dict mapping IRIs to their
short names defined in the context.
Arguments are passed to `get_jsonld_context()`.
"""
ctx = get_jsonld_context(
context=context, timeout=timeout, fromfile=fromfile
)
prefixes = get_prefixes(context=ctx)
shortnames = {
expand_iri(v["@id"] if isinstance(v, dict) else v, prefixes): k
for k, v in ctx.items()
if (
(isinstance(v, str) and not v.endswith(("#", "/")))
or isinstance(v, dict)
)
}
shortnames.setdefault(RDF.type, "@type")
return shortnames
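For example, assuming `downloadURL` is mapped to `dcat:downloadURL` in the default context:

    shortnames = get_shortnames()
    shortnames["http://www.w3.org/ns/dcat#downloadURL"]  # -> "downloadURL"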
get_values(data, key, extend=True)
¶
Parse `data` recursively and return a list with the values corresponding to the given key.
If `extend` is true, the returned list will be extended with values that themselves are lists, instead of appending them in a nested manner.
Source code in tripper/dataset/dataset.py
def get_values(
data: "Union[dict, list]", key: str, extend: bool = True
) -> list:
"""Parse `data` recursively and return a list with the values
corresponding to the given key.
If `extend` is true, the returned list will be extended with
values that themselves are list, instead of appending them in a
nested manner.
"""
values = []
if isinstance(data, dict):
val = data.get(key)
if extend and isinstance(val, list):
values.extend(val)
elif val:
values.append(val)
for v in data.values():
if isinstance(v, (dict, list)):
values.extend(get_values(v, key))
elif isinstance(data, list):
for ele in data:
if isinstance(ele, (dict, list)):
values.extend(get_values(ele, key))
return values
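For example:

    data = {
        "x": {"datamodel": "http://example.com/dm1"},
        "y": [{"datamodel": "http://example.com/dm2"}],
    }
    get_values(data, "datamodel")
    # -> ['http://example.com/dm1', 'http://example.com/dm2']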
load_dict(ts, iri, use_sparql=None)
¶
Load dict representation of data with given IRI from the triplestore.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ts | Triplestore | Triplestore to load data from. | required |
iri | str | IRI of the data to load. | required |
use_sparql | Optional[bool] | Whether to access the triplestore with SPARQL. Defaults to `ts.prefer_sparql`. | None |
Returns:
Type | Description |
---|---|
dict | Dict-representation of the loaded data. |
Source code in tripper/dataset/dataset.py
def load_dict(
ts: Triplestore, iri: str, use_sparql: "Optional[bool]" = None
) -> dict:
"""Load dict representation of data with given IRI from the triplestore.
Arguments:
ts: Triplestore to load data from.
iri: IRI of the data to load.
use_sparql: Whether to access the triplestore with SPARQL.
Defaults to `ts.prefer_sparql`.
Returns:
Dict-representation of the loaded data.
"""
if use_sparql is None:
use_sparql = ts.prefer_sparql
if use_sparql:
return _load_sparql(ts, iri)
nested = dicttypes.keys()
d = AttrDict()
dct = _load_triples(ts, iri)
for k, v in dct.items():
if k in nested:
if not isinstance(v, list):
v = [v]
for vv in v:
d[k] = load_dict(ts, iri=vv, use_sparql=use_sparql)
add(d[k], "@type", dicttypes[k]["@type"])
else:
d[k] = v
return d
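A minimal sketch, assuming a dataset with this hypothetical IRI has already been stored with `save_dict()` or `save_datadoc()` and has a distribution with a `downloadURL`:

    d = load_dict(ts, "http://example.com/mydata")
    d["distribution"]["downloadURL"]  # nested resources are loaded recursively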
load_list(ts, iri)
¶
Load and return RDF list whose first node is `iri`.
Source code in tripper/dataset/dataset.py
def load_list(ts: Triplestore, iri: str):
"""Load and return RDF list whose first node is `iri`."""
lst = []
for p, o in ts.predicate_objects(iri):
if p == RDF.first:
lst.append(o)
elif p == RDF.rest:
lst.extend(load_list(ts, o))
return lst
prepare_datadoc(datadoc)
¶
Return an updated version of dict `datadoc` that is prepared with additional key-value pairs needed for creating valid JSON-LD that can be serialised to RDF.
Source code in tripper/dataset/dataset.py
def prepare_datadoc(datadoc: dict) -> dict:
"""Return an updated version of dict `datadoc` that is prepared with
additional key-value pairs needed for creating valid JSON-LD that
can be serialised to RDF.
"""
d = AttrDict({"@context": CONTEXT_URL})
d.update(datadoc)
context = datadoc.get("@context")
prefixes = get_prefixes(context=context)
if "prefixes" in d:
d.prefixes.update(prefixes)
else:
d.prefixes = prefixes.copy()
for type, spec in dicttypes.items():
label = spec["datadoc_label"]
for i, dct in enumerate(get(d, label)):
d[label][i] = as_jsonld(
dct=dct, type=type, prefixes=d.prefixes, _context=context
)
return d
read_datadoc(filename)
¶
Read YAML data documentation and return it as a dict.
The filename may also be a URL to a file accessible with HTTP GET.
Source code in tripper/dataset/dataset.py
def read_datadoc(filename: "Union[str, Path]") -> dict:
"""Read YAML data documentation and return it as a dict.
The filename may also be an URL to a file accessible with HTTP GET.
"""
import yaml # type: ignore
with openfile(filename, mode="rt", encoding="utf-8") as f:
d = yaml.safe_load(f)
return prepare_datadoc(d)
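For example (the file name is hypothetical; a URL works as well):

    d = read_datadoc("datadoc.yaml")
    d["@context"]  # the JSON-LD context (CONTEXT_URL unless the YAML overrides it)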
save_datadoc(ts, file_or_dict)
¶
Populate triplestore with data documentation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ts | Triplestore | Triplestore to save dataset documentation to. | required |
file_or_dict | Union[str, Path, dict] | Data documentation dict or name of a YAML file to read the data documentation from. It may also be a URL to a file accessible with HTTP GET. | required |
Returns:
Type | Description |
---|---|
dict | Dict-representation of the loaded dataset. |
Source code in tripper/dataset/dataset.py
def save_datadoc(
ts: Triplestore, file_or_dict: "Union[str, Path, dict]"
) -> dict:
"""Populate triplestore with data documentation.
Arguments:
ts: Triplestore to save dataset documentation to.
file_or_dict: Data documentation dict or name of a YAML file to read
the data documentation from. It may also be an URL to a file
accessible with HTTP GET.
Returns:
Dict-representation of the loaded dataset.
"""
if isinstance(file_or_dict, dict):
d = prepare_datadoc(file_or_dict)
else:
d = read_datadoc(file_or_dict)
# Bind prefixes
context = d.get("@context")
prefixes = get_prefixes(context=context)
prefixes.update(d.get("prefixes", {}))
for prefix, ns in prefixes.items(): # type: ignore
ts.bind(prefix, ns)
# Maps datadoc_labels to type
types = {v["datadoc_label"]: k for k, v in dicttypes.items()}
# Write json-ld data to triplestore (using temporary rdflib triplestore)
for spec in dicttypes.values():
label = spec["datadoc_label"]
for dct in get(d, label):
dct = as_jsonld(
dct=dct, type=types[label], prefixes=prefixes, _context=context
)
f = io.StringIO(json.dumps(dct))
with Triplestore(backend="rdflib") as ts2:
ts2.parse(f, format="json-ld")
ts.add_triples(ts2.triples())
# Add statements and datamodels to triplestore
save_extra_content(ts, d)
return d
save_dict(ts, dct, type='dataset', prefixes=None, **kwargs)
¶
Save a dict representation of given type of data to a triplestore.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ts | Triplestore | Triplestore to save to. | required |
dct | dict | Dict with data to save. | required |
type | str | Type of data to save. Should either be one of the pre-defined names: "dataset", "distribution", "accessService", "parser" and "generator" or an IRI to a class in an ontology. Defaults to "dataset". | 'dataset' |
prefixes | Optional[dict] | Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. | None |
kwargs | | Additional keyword arguments to add to the returned dict. A leading underscore in a key will be translated to a leading "@"-sign. For example, "@id=..." may be provided as "_id=...". | {} |
Returns:
Type | Description |
---|---|
dict | An updated copy of `dct`. |
Notes
The keys in `dct` and `kwargs` may be either properties defined in the JSON-LD context (see References) or one of the following special keywords:
- "@id": Dataset IRI. Must always be given.
- "@type": IRI of the ontology class for this type of data. For datasets, it is typically used to refer to a specific subclass of `emmo:DataSet` that provides a semantic description of this dataset.
References:
JSON-LD context: https://raw.githubusercontent.com/EMMC-ASBL/oteapi-dlite/refs/heads/rdf-serialisation/oteapi_dlite/context/0.2/context.json
Source code in tripper/dataset/dataset.py
def save_dict(
ts: Triplestore,
dct: dict,
type: str = "dataset",
prefixes: "Optional[dict]" = None,
**kwargs,
) -> dict:
# pylint: disable=line-too-long,too-many-branches
"""Save a dict representation of given type of data to a triplestore.
Arguments:
ts: Triplestore to save to.
dct: Dict with data to save.
type: Type of data to save. Should either be one of the
pre-defined names: "dataset", "distribution", "accessService",
"parser" and "generator" or an IRI to a class in an ontology.
Defaults to "dataset".
prefixes: Dict with prefixes in addition to those included in the
JSON-LD context. Should map namespace prefixes to IRIs.
kwargs: Additional keyword arguments to add to the returned dict.
A leading underscore in a key will be translated to a
leading "@"-sign. For example, "@id=..." may be provided
as "_id=...".
Returns:
An updated copy of `dct`.
Notes:
The keys in `dct` and `kwargs` may be either properties defined in the
[JSON-LD context] or one of the following special keywords:
- "@id": Dataset IRI. Must always be given.
- "@type": IRI of the ontology class for this type of data.
For datasets, it is typically used to refer to a specific subclass
of `emmo:DataSet` that provides a semantic description of this
dataset.
References:
[JSON-LD context]: https://raw.githubusercontent.com/EMMC-ASBL/oteapi-dlite/refs/heads/rdf-serialisation/oteapi_dlite/context/0.2/context.json
"""
if "@id" not in dct:
raise ValueError("`dct` must have an '@id' key")
all_prefixes = get_prefixes()
if prefixes:
all_prefixes.update(prefixes)
d = as_jsonld(dct=dct, type=type, prefixes=all_prefixes, **kwargs)
# Bind prefixes
for prefix, ns in all_prefixes.items():
ts.bind(prefix, ns)
# Write json-ld data to triplestore (using temporary rdflib triplestore)
f = io.StringIO(json.dumps(d))
with Triplestore(backend="rdflib") as ts2:
ts2.parse(f, format="json-ld")
ts.add_triples(ts2.triples())
# Add statements and data models to triplestore
save_extra_content(ts, d)
return d
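A minimal sketch, assuming the DCAT keywords `distribution`, `downloadURL` and `mediaType` from the default JSON-LD context (the IRIs are hypothetical):

    from tripper import Triplestore

    ts = Triplestore(backend="rdflib")
    save_dict(
        ts,
        {
            "@id": "http://example.com/mydata",
            "distribution": {
                "downloadURL": "http://example.com/mydata.csv",
                "mediaType": "text/csv",
            },
        },
    )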
save_extra_content(ts, dct)
¶
Save extra content in `dct` to the triplestore.
Currently, this includes:
- statements and mappings
- data models (requires that DLite is installed)
Source code in tripper/dataset/dataset.py
def save_extra_content(ts: Triplestore, dct: dict) -> None:
"""Save extra content in `dct` to the triplestore.
Currently, this includes:
- statements and mappings
- data models (require that DLite is installed)
"""
import requests
# Save statements and mappings
statements = get_values(dct, "statements")
statements.extend(get_values(dct, "mappings"))
ts.add_triples(statements)
# Save data models
datamodels = get_values(dct, "datamodel")
try:
# pylint: disable=import-outside-toplevel
import dlite
from dlite.dataset import add_dataset
except ModuleNotFoundError:
if datamodels:
warnings.warn(
"dlite is not installed - data models will not be added to "
"the triplestore"
)
else:
for url in get_values(dct, "datamodelStorage"):
dlite.storage_path.append(url)
for uri in datamodels:
r = requests.get(uri, timeout=3)
if r.ok:
content = (
r.content.decode()
if isinstance(r.content, bytes)
else str(r.content)
)
dm = dlite.Instance.from_json(content)
add_dataset(ts, dm)
else:
try:
dm = dlite.get_instance(uri)
except (
dlite.DLiteMissingInstanceError # pylint: disable=no-member
):
# __FIXME__: check session whether to warn or re-raise
warnings.warn(f"cannot load datamodel: {uri}")
else:
add_dataset(ts, dm)
search_iris(ts, type='http://www.w3.org/ns/dcat#Dataset', **kwargs)
¶
Return a list of IRIs for all entries of the given type.
Additional matching criteria can be specified with `kwargs`.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
ts | Triplestore | Triplestore to search. | required |
type | | Search for entries that are individuals of the class with this IRI. The default is `dcat:Dataset`. | 'http://www.w3.org/ns/dcat#Dataset' |
kwargs | | Match criteria. | {} |
Examples:
List all dataset IRIs:
    search_iris(ts)
List IRIs of all datasets with John Doe as `contactPoint`:
    search_iris(ts, contactPoint="John Doe")
List IRIs of all samples:
    search_iris(ts, type=CHAMEO.Sample)
List IRIs of all datasets that have John Doe as `contactPoint` AND are measured on a given sample:
    search_iris(ts, contactPoint="John Doe", fromSample=SAMPLE.batch2/sample3)
Source code in tripper/dataset/dataset.py
def search_iris(ts: Triplestore, type=DCAT.Dataset, **kwargs):
"""Return a list of IRIs for all entries of the given type.
Additional matching criterias can be specified by `kwargs`.
Arguments:
ts: Triplestore to search.
type: Search for entries that are individuals of the class with
this IRI. The default is `dcat:Dataset`.
kwargs: Match criterias.
Examples:
List all dataset IRIs:
search_iris(ts)
List IRIs of all datasets with John Doe as `contactPoint`:
search_iris(ts, contactPoint="John Doe")
List IRIs of all samples:
search_iris(ts, type=CHAMEO.Sample)
List IRIs of all datasets with John Doe as `contactPoint` AND are
measured on a given sample:
search_iris(
ts, contactPoint="John Doe", fromSample=SAMPLE.batch2/sample3
)
"""
crit = []
if type:
crit.append(f" ?iri rdf:type <{type}> .")
expanded = {v: k for k, v in get_shortnames().items()}
for k, v in kwargs.items():
key = f"@{k[1:]}" if k.startswith("_") else k
predicate = expanded[key]
if v in expanded:
value = f"<{expanded[v]}>"
elif isinstance(v, str):
value = (
f"<{v}>" if re.match("^[a-z][a-z0-9.+-]*://", v) else f'"{v}"'
)
else:
value = v
crit.append(f" ?iri <{predicate}> {value} .")
criterias = "\n".join(crit)
query = f"""
PREFIX rdf: <{RDF}>
SELECT ?iri
WHERE {{
{criterias}
}}
"""
return [r[0] for r in ts.query(query)] # type: ignore