Skip to content

datadoc

A script for data documentation.

main(argv=None)

Main function.

Source code in tripper/dataset/datadoc.py
def main(argv=None):
    """Main function.

    Builds the command-line parser, creates the triplestore described by
    the general options and dispatches to the handler of the selected
    subcommand (`add`, `find` or `load`).

    Arguments:
        argv: Sequence of command-line arguments to parse.  It is passed
            unchanged to `ArgumentParser.parse_args()`, so None makes
            argparse read the arguments from `sys.argv`.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Tool for data documentation.\n\n"
            "It allows populating and searching a triplestore for existing "
            "documentation."
        ),
    )

    subparsers = parser.add_subparsers(required=True, help="Subcommands:")

    # Subcommand: add
    parser_add = subparsers.add_parser(
        "add",
        help="Populate the triplestore with data documentation.",
    )
    parser_add.set_defaults(func=subcommand_add)
    parser_add.add_argument(
        "input",
        help=(
            "Path or URL to the input the triplestore should be populated "
            "from."
        ),
    )
    parser_add.add_argument(
        "--input-format",
        "-i",
        choices=["yaml", "csv"],
        help=(
            "Input format. By default it is inferred from the file "
            "extension of the `input` argument."
        ),
    )
    parser_add.add_argument(
        "--csv-options",
        action="extend",
        nargs="+",
        metavar="OPTION=VALUE",
        help=(
            "Options describing the CSV dialect for --input-format=csv. "
            "Common options are 'dialect', 'delimiter' and 'quotechar'."
        ),
    )
    parser_add.add_argument(
        "--context",
        help="Path or URL to custom JSON-LD context for the input.",
    )
    parser_add.add_argument(
        "--dump",
        "-d",
        metavar="FILENAME",
        help="File to dump the populated triplestore to.",
    )
    parser_add.add_argument(
        "--format",
        "-f",
        default="turtle",
        help='Format to use with `--dump`.  Default is "turtle".',
    )

    # Subcommand: find
    parser_find = subparsers.add_parser(
        "find", help="Find documented resources in the triplestore."
    )
    parser_find.set_defaults(func=subcommand_find)
    parser_find.add_argument(
        "--type",
        "-t",
        help=(
            'Either a resource type (ex: "dataset", "distribution", ...) '
            "or the IRI of a class to limit the search to."
        ),
    )
    parser_find.add_argument(
        "--criteria",
        "-c",
        action="extend",
        nargs="+",
        metavar="KEYWORD=VALUE",
        help=(
            "One or more additional matching criteria for resources to find. "
            "Only resources with the given KEYWORD and VALUE will be matched. "
            "The match is exact."
        ),
    )
    parser_find.add_argument(
        "--output",
        "-o",
        metavar="FILENAME",
        help=(
            "Write matching output to the given file. The default is to "
            "write to standard output."
        ),
    )
    # No `default` here on purpose: the option must stay None when it is
    # not given, otherwise the handler can never fall back to inferring
    # the format from the extension of --output (or to "iris").
    parser_find.add_argument(
        "--format",
        "-f",
        choices=["iris", "json", "turtle", "csv"],
        help=(
            "Output format to list the matched resources. The default is "
            "to infer from the file extension if --output is given. "
            'Otherwise it defaults to "iris".'
        ),
    )

    # Subcommand: load
    parser_load = subparsers.add_parser(
        "load", help="Load documented dataset from a storage."
    )
    parser_load.set_defaults(func=subcommand_load)
    parser_load.add_argument(
        "iri",
        help="IRI of dataset to load.",
    )
    parser_load.add_argument(
        "--output",
        "-o",
        metavar="FILENAME",
        help=(
            "Write the dataset to the given file. The default is to write "
            "to standard output."
        ),
    )

    # General: options
    parser.add_argument(
        "--backend",
        "-b",
        default="rdflib",
        help=(
            'Triplestore backend to use. Defaults to "rdflib" - an '
            "in-memory rdflib triplestore, that can be pre-loaded with "
            "--parse."
        ),
    )
    parser.add_argument(
        "--base-iri",
        "-B",
        help="Base IRI of the triplestore.",
    )
    parser.add_argument(
        "--database",
        "-d",
        help="Name of database to connect to (for backends supporting it).",
    )
    parser.add_argument(
        "--package",
        help="Only needed when `backend` is a relative module.",
    )
    parser.add_argument(
        "--parse",
        "-p",
        metavar="LOCATION",
        help="Load triplestore from this location.",
    )
    parser.add_argument(
        "--parse-format",
        "-F",
        help="Used with `--parse`. Format to use when parsing triplestore.",
    )
    parser.add_argument(
        "--prefixes",
        "-P",
        action="extend",
        nargs="+",
        metavar="PREFIX=URL",
        help="Namespace prefixes to bind to the triplestore.",
    )

    args = parser.parse_args(argv)

    ts = Triplestore(
        backend=args.backend,
        base_iri=args.base_iri,
        database=args.database,
        package=args.package,
    )
    if args.parse:
        ts.parse(args.parse, format=args.parse_format)

    if args.prefixes:
        for token in args.prefixes:
            # Split on the first "=" only, since the URL may contain "="
            prefix, ns = token.split("=", 1)
            ts.bind(prefix, ns)

    # Call subcommand handler
    args.func(ts, args)

subcommand_add(ts, args)

Subcommand for populating the triplestore.

Source code in tripper/dataset/datadoc.py
def subcommand_add(ts, args):
    """Subcommand for populating the triplestore.

    Arguments:
        ts: Triplestore to populate.
        args: Parsed command-line arguments.  The attributes `input`,
            `input_format`, `csv_options`, `context`, `dump` and `format`
            are read.

    Raises:
        ValueError: If the input format is unknown or if a CSV option is
            not written as OPTION=VALUE.
    """
    # pylint: disable=import-outside-toplevel
    from urllib.parse import urlsplit

    # `input` may be a URL.  Wrapping a URL in `Path` would collapse the
    # "//" after the scheme and leave the query string in the suffix, so
    # URLs are kept as plain strings and the extension is taken from the
    # path component only.
    location = urlsplit(args.input)
    if location.scheme and location.netloc:
        # NOTE(review): assumes save_datadoc() and TableDoc.parse_csv()
        # accept a URL string as input -- confirm
        infile = args.input
        suffix = Path(location.path).suffix
    else:
        infile = Path(args.input)
        suffix = infile.suffix

    extension = args.input_format if args.input_format else suffix
    fmt = extension.lower().lstrip(".")

    if fmt in ("yml", "yaml"):
        save_datadoc(ts, infile)
    elif fmt == "csv":
        kw = {}
        for token in args.csv_options or ():
            option, sep, value = token.partition("=")
            if not sep:
                raise ValueError(
                    f"Invalid CSV option (expected OPTION=VALUE): {token}"
                )
            kw[option] = value
        td = TableDoc.parse_csv(
            infile, context=get_jsonld_context(args.context), **kw
        )
        td.save(ts)
    else:
        raise ValueError(f"Unknown input format: {fmt}")

    if args.dump:
        ts.serialize(args.dump, format=args.format)

subcommand_find(ts, args)

Subcommand for finding IRIs in the triplestore.

Source code in tripper/dataset/datadoc.py
def subcommand_find(ts, args):
    """Subcommand for finding IRIs in the triplestore.

    Arguments:
        ts: Triplestore to search.
        args: Parsed command-line arguments.  The attributes `type`,
            `criteria`, `format` and `output` are read.

    Raises:
        ValueError: If the output format is not recognised.
    """
    if args.criteria:
        # Split on the first "=" only, since values may contain "="
        kwargs = dict(crit.split("=", 1) for crit in args.criteria)
    else:
        kwargs = {}
    iris = search_iris(ts, type=args.type, **kwargs)

    # Infer format: explicit option first, then the extension of the
    # output file, and finally a plain list of IRIs
    if args.format:
        fmt = args.format.lower()
    elif args.output:
        fmt = Path(args.output).suffix.lower().lstrip(".")
    else:
        fmt = "iris"

    # Create output
    if fmt in ("iris", "txt"):
        s = "\n".join(iris)
    elif fmt == "json":
        s = json.dumps([load_dict(ts, iri) for iri in iris], indent=2)
    elif fmt in ("turtle", "ttl"):
        # Copy the matched resources to a temporary triplestore, such
        # that only they are serialised
        ts2 = Triplestore("rdflib")
        for iri in iris:
            d = load_dict(ts, iri)
            save_dict(ts2, d)
        s = ts2.serialize()
    elif fmt == "csv":
        dicts = [load_dict(ts, iri) for iri in iris]
        td = TableDoc.fromdicts(dicts)
        with io.StringIO() as f:
            td.write_csv(f)
            s = f.getvalue()
    else:
        raise ValueError(f"Unknown format: {fmt}")

    if args.output:
        with open(args.output, "wt", encoding="utf-8") as f:
            # The file is opened in text mode, which already translates
            # "\n" to the platform line ending.  Writing `os.linesep`
            # here would produce "\r\r\n" on Windows.
            f.write(s + "\n")
    else:
        print(s)

subcommand_load(ts, args)

Subcommand for loading a documented dataset from a storage.

Source code in tripper/dataset/datadoc.py
def subcommand_load(ts, args):
    """Subcommand for loading a documented dataset from a storage.

    Arguments:
        ts: Triplestore passed on to `load()`.
        args: Parsed command-line arguments.  The attributes `iri` and
            `output` are read.
    """
    payload = load(ts, args.iri)

    # Without --output, the loaded data is printed to standard output
    if not args.output:
        print(payload)
        return

    # The file is opened in binary mode, so the data is written as-is
    with open(args.output, "wb") as outfile:
        outfile.write(payload)