
tabledoc

Basic interface for tabular documentation of datasets.

TableDoc

Representation of tabular documentation of datasets.

Parameters:

    header (Sequence[str], required):
        Sequence of column header labels. Nested data can be represented
        by dot-separated label strings (e.g. "distribution.downloadURL").
    data (Sequence[Sequence[str]], required):
        Sequence of rows of data. Each row documents an entry.
    type (Optional[str], default: "Dataset"):
        Type of data to save (applies to all rows). Should either be one of
        the pre-defined names: "Dataset", "Distribution", "AccessService",
        "Parser" and "Generator" or an IRI to a class in an ontology.
    prefixes (Optional[dict], default: None):
        Dict with prefixes in addition to those included in the JSON-LD
        context. Should map namespace prefixes to IRIs.
    context (Optional[ContextType], default: None):
        Additional user-defined context that should be returned on top of
        the default context. It may be a string with a URL to the
        user-defined context, a dict with the user-defined context or a
        sequence of strings and dicts.
    strip (bool, default: True):
        Whether to strip leading and trailing whitespace from cells.
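
A minimal usage sketch (hypothetical data; the import path follows the
source location shown below, and the "ex" prefix and IRIs are made up for
illustration):

from tripper.datadoc.tabledoc import TableDoc

# Two rows, each documenting one dataset.  The dot-separated column
# "distribution.downloadURL" is stored as a nested "distribution" object.
td = TableDoc(
    header=["@id", "title", "distribution.downloadURL"],
    data=[
        ["ex:data1", "First dataset", "http://example.com/data1.csv"],
        ["ex:data2", "Second dataset", "http://example.com/data2.csv"],
    ],
    prefixes={"ex": "http://example.com/ns#"},
)
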
Source code in tripper/datadoc/tabledoc.py
class TableDoc:
    """Representation of tabular documentation of datasets.

    Arguments:
        header: Sequence of column header labels.  Nested data can
            be represented by dot-separated label strings (e.g.
            "distribution.downloadURL")
        data: Sequence of rows of data. Each row documents an entry.
        type: Type of data to save (applies to all rows).  Should
            either be one of the pre-defined names: "Dataset",
            "Distribution", "AccessService", "Parser" and "Generator"
            or an IRI to a class in an ontology.  Defaults to
            "Dataset".
        prefixes: Dict with prefixes in addition to those included in the
            JSON-LD context.  Should map namespace prefixes to IRIs.
        context: Additional user-defined context that should be
            returned on top of the default context.  It may be a
            string with a URL to the user-defined context, a dict
            with the user-defined context or a sequence of strings and
            dicts.
        strip: Whether to strip leading and trailing whitespaces from cells.

    """

    # pylint: disable=redefined-builtin,too-few-public-methods

    def __init__(
        self,
        header: "Sequence[str]",
        data: "Sequence[Sequence[str]]",
        type: "Optional[str]" = "Dataset",
        context: "Optional[ContextType]" = None,
        prefixes: "Optional[dict]" = None,
        strip: bool = True,
    ):
        self.header = list(header)
        self.data = [list(row) for row in data]
        self.type = type
        self.context: Context = Context(context=context)
        ## self.prefixes = get_prefixes(context=context)
        if prefixes:
            ## self.prefixes.update(prefixes)
            self.context.add_context(prefixes)
        ## self.context = context if context else {}
        self.strip = strip

    def save(self, ts: Triplestore) -> None:
        """Save tabular datadocumentation to triplestore."""
        ## self.prefixes.update(ts.namespaces)

        self.context.add_context(
            {prefix: str(ns) for prefix, ns in ts.namespaces.items()}
        )

        for prefix, ns in self.context.get_prefixes().items():
            ts.bind(prefix, ns)

        store(ts, self.asdicts(), type=self.type, context=self.context)

    def asdicts(self) -> "List[dict]":
        """Return the table as a list of dicts."""
        results = []
        for row in self.data:
            d = AttrDict()
            for i, colname in enumerate(self.header):
                cell = row[i].strip() if row[i] and self.strip else row[i]
                if cell:
                    addnested(
                        d, colname.strip() if self.strip else colname, cell
                    )
            results.append(d)
        ld = told(
            results,
            type=self.type,
            prefixes=self.context.get_prefixes(),
            context=self.context,
        )
        dicts = ld["@graph"]
        return dicts

    @staticmethod
    def fromdicts(
        dicts: "Sequence[dict]",
        type: "Optional[str]" = "Dataset",
        prefixes: "Optional[dict]" = None,
        context: "Optional[Union[str, dict, list]]" = None,
        strip: bool = True,
    ) -> "TableDoc":
        """Create new TableDoc instance from a sequence of dicts.

        Arguments:
            dicts: Sequence of single-resource dicts.
            type: Type of data to save (applies to all rows).  Should
                either be one of the pre-defined names: "Dataset",
                "Distribution", "AccessService", "Parser" and
                "Generator" or an IRI to a class in an ontology.
                Defaults to "Dataset".
            prefixes: Dict with prefixes in addition to those included
                in the JSON-LD context.  Should map namespace prefixes
                to IRIs.
            context: Additional user-defined context that should be
                returned on top of the default context.  It may be a
                string with a URL to the user-defined context, a dict
                with the user-defined context or a sequence of strings
                and dicts.
            strip: Whether to strip leading and trailing whitespaces
                from cells.

        Returns:
            New TableDoc instance.

        """
        # Store the header as keys in a dict to keep ordering
        headdict = {"@id": True}

        def addheaddict(d, prefix=""):
            """Add keys in `d` to headdict.

            Nested dicts will result in dot-separated keys.
            """
            for k, v in d.items():
                if k == "@context":
                    pass
                elif isinstance(v, dict):
                    nested = v.copy()
                    nested.pop("@type", None)
                    addheaddict(nested, prefix + k + ".")
                else:
                    headdict[prefix + k] = True

        # Populate headdict with keys from all dicts
        for d in dicts:
            addheaddict(d)

        header = list(headdict)

        # Column multiplicity
        mult = [1] * len(header)

        # Build the table data. Nested dicts are accounted for
        data = []
        for dct in dicts:
            row = []
            for i, head in enumerate(header):
                if head in dct:
                    row.append(dct[head])
                else:
                    d = dct
                    for key in head.split("."):
                        d = d.get(key, {})
                    row.append(d if d != {} else None)
                if isinstance(row[-1], list):  # added value is a list
                    # keep the largest multiplicity seen for this column
                    mult[i] = max(mult[i], len(row[-1]))
            data.append(row)

        # Expand the table, duplicating multi-valued columns
        if max(mult) > 1:
            exp_header = []
            for h, m in zip(header, mult):
                exp_header.extend([h] * m)

            exp_data = []
            for row in data:
                r = []
                for m, v in zip(mult, row):
                    vals = v if isinstance(v, list) else [v]
                    # pad short lists so all rows stay aligned
                    r.extend(vals + [None] * (m - len(vals)))
                exp_data.append(r)
            header, data = exp_header, exp_data

        return TableDoc(
            header=header,
            data=data,
            type=type,
            prefixes=prefixes,
            context=context,
            strip=strip,
        )

    @staticmethod
    def parse_csv(
        csvfile: "Union[Iterable[str], Path, str]",
        type: "Optional[str]" = "Dataset",
        prefixes: "Optional[dict]" = None,
        context: "Optional[Union[dict, list]]" = None,
        encoding: str = "utf-8",
        dialect: "Optional[Union[csv.Dialect, str]]" = None,
        **kwargs,
    ) -> "TableDoc":
        # pylint: disable=line-too-long
        """Parse a csv file using the standard library csv module.

        Arguments:
            csvfile: Name of CSV file to parse or an iterable of strings.
            type: Type of data to save (applies to all rows).  Should
                either be one of the pre-defined names: "Dataset",
                "Distribution", "AccessService", "Parser" and "Generator"
                or an IRI to a class in an ontology.  Defaults to
                "Dataset".
            prefixes: Dict with prefixes in addition to those included in the
                JSON-LD context.  Should map namespace prefixes to IRIs.
            context: Dict with user-defined JSON-LD context.
            encoding: The encoding of the csv file.  Note that Excel may
                encode as "ISO-8859" (which was commonly used in the 1990s).
            dialect: A subclass of csv.Dialect, or the name of the dialect,
                specifying how the `csvfile` is formatted.  For more details,
                see [Dialects and Formatting Parameters].
            kwargs: Additional keyword arguments overriding individual
                formatting parameters.  For more details, see
                [Dialects and Formatting Parameters].

        Returns:
            New TableDoc instance.

        References:
        [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
        """

        def read(f, dialect):
            """Return csv reader from file-like object `f`."""
            if dialect is None and not kwargs:
                sample = f.read(1024)
                try:
                    dialect = csv.Sniffer().sniff(sample, delimiters=",;\t ")
                except csv.Error:
                    # The built-in sniffer does not always work well with
                    # non-numerical csv files.  Try our simple sniffer.
                    dialect = csvsniff(sample)
                finally:
                    f.seek(0)
            reader = csv.reader(f, dialect=dialect, **kwargs)
            header = next(reader)
            data = list(reader)
            return header, data

        if isinstance(csvfile, (str, Path)):
            with openfile(csvfile, mode="rt", encoding=encoding) as f:
                header, data = read(f, dialect)
        else:
            header, data = read(csvfile, dialect)

        return TableDoc(
            header=header,
            data=data,
            type=type,
            prefixes=prefixes,
            context=context,
        )

    def write_csv(
        self,
        csvfile: "Union[Path, str, Writer]",
        encoding: str = "utf-8",
        dialect: "Union[csv.Dialect, str]" = "excel",
        prefixes: "Optional[dict]" = None,
        **kwargs,
    ) -> None:
        # pylint: disable=line-too-long
        """Write the table to a csv file using the standard library csv module.

        Arguments:
            csvfile: File-like object or name of CSV file to write.
            encoding: The encoding of the csv file.
            dialect: A subclass of csv.Dialect, or the name of the dialect,
                specifying how the `csvfile` is formatted.  For more details,
                see [Dialects and Formatting Parameters].
            prefixes: Prefixes used to compact the header.
            kwargs: Additional keyword arguments overriding individual
                formatting parameters.  For more details, see
                [Dialects and Formatting Parameters].

        References:
        [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
        """

        def write(f):
            writer = csv.writer(f, dialect=dialect, **kwargs)

            # TODO: use self.context and compact to shortnames
            if prefixes:
                header = []
                for h in self.header:
                    for prefix, ns in prefixes.items():
                        if h.startswith(str(ns)):
                            header.append(f"{prefix}:{h[len(str(ns)):]}")
                            break
                    else:
                        header.append(h)
                writer.writerow(header)
            else:
                writer.writerow(self.header)

            for row in self.data:
                writer.writerow(row)

        if isinstance(csvfile, (str, Path)):
            with open(csvfile, mode="wt", encoding=encoding) as f:
                write(f)
        else:
            write(csvfile)

asdicts(self)

Return the table as a list of dicts.
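
For example, continuing the TableDoc instance `td` from the sketch above
(a hedged illustration; the exact keys depend on the JSON-LD processing
done by told()):

dicts = td.asdicts()
print(dicts[0]["@id"])           # e.g. "ex:data1"
print(dicts[0]["distribution"])  # nested dict holding the downloadURL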

Source code in tripper/datadoc/tabledoc.py
def asdicts(self) -> "List[dict]":
    """Return the table as a list of dicts."""
    results = []
    for row in self.data:
        d = AttrDict()
        for i, colname in enumerate(self.header):
            cell = row[i].strip() if row[i] and self.strip else row[i]
            if cell:
                addnested(
                    d, colname.strip() if self.strip else colname, cell
                )
        results.append(d)
    ld = told(
        results,
        type=self.type,
        prefixes=self.context.get_prefixes(),
        context=self.context,
    )
    dicts = ld["@graph"]
    return dicts

fromdicts(dicts, type='Dataset', prefixes=None, context=None, strip=True) staticmethod

Create new TableDoc instance from a sequence of dicts.

Parameters:

    dicts (Sequence[dict], required):
        Sequence of single-resource dicts.
    type (Optional[str], default: "Dataset"):
        Type of data to save (applies to all rows). Should either be one of
        the pre-defined names: "Dataset", "Distribution", "AccessService",
        "Parser" and "Generator" or an IRI to a class in an ontology.
    prefixes (Optional[dict], default: None):
        Dict with prefixes in addition to those included in the JSON-LD
        context. Should map namespace prefixes to IRIs.
    context (Optional[Union[str, dict, list]], default: None):
        Additional user-defined context that should be returned on top of
        the default context. It may be a string with a URL to the
        user-defined context, a dict with the user-defined context or a
        sequence of strings and dicts.
    strip (bool, default: True):
        Whether to strip leading and trailing whitespace from cells.

Returns:

    TableDoc: New TableDoc instance.
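
A hedged round-trip sketch (dict contents are made up; as in the code
below, nested dicts become dot-separated header labels):

dicts = [
    {
        "@id": "ex:data1",
        "title": "First dataset",
        "distribution": {"downloadURL": "http://example.com/data1.csv"},
    },
]
td = TableDoc.fromdicts(dicts, prefixes={"ex": "http://example.com/ns#"})
print(td.header)  # ['@id', 'title', 'distribution.downloadURL']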

Source code in tripper/datadoc/tabledoc.py
@staticmethod
def fromdicts(
    dicts: "Sequence[dict]",
    type: "Optional[str]" = "Dataset",
    prefixes: "Optional[dict]" = None,
    context: "Optional[Union[str, dict, list]]" = None,
    strip: bool = True,
) -> "TableDoc":
    """Create new TableDoc instance from a sequence of dicts.

    Arguments:
        dicts: Sequence of single-resource dicts.
        type: Type of data to save (applies to all rows).  Should
            either be one of the pre-defined names: "Dataset",
            "Distribution", "AccessService", "Parser" and
            "Generator" or an IRI to a class in an ontology.
            Defaults to "Dataset".
        prefixes: Dict with prefixes in addition to those included
            in the JSON-LD context.  Should map namespace prefixes
            to IRIs.
        context: Additional user-defined context that should be
            returned on top of the default context.  It may be a
            string with a URL to the user-defined context, a dict
            with the user-defined context or a sequence of strings
            and dicts.
        strip: Whether to strip leading and trailing whitespaces
            from cells.

    Returns:
        New TableDoc instance.

    """
    # Store the header as keys in a dict to keep ordering
    headdict = {"@id": True}

    def addheaddict(d, prefix=""):
        """Add keys in `d` to headdict.

        Nested dicts will result in dot-separated keys.
        """
        for k, v in d.items():
            if k == "@context":
                pass
            elif isinstance(v, dict):
                nested = v.copy()
                nested.pop("@type", None)
                addheaddict(nested, prefix + k + ".")
            else:
                headdict[prefix + k] = True

    # Populate headdict with keys from all dicts
    for d in dicts:
        addheaddict(d)

    header = list(headdict)

    # Column multiplicity
    mult = [1] * len(header)

    # Build the table data. Nested dicts are accounted for
    data = []
    for dct in dicts:
        row = []
        for i, head in enumerate(header):
            if head in dct:
                row.append(dct[head])
            else:
                d = dct
                for key in head.split("."):
                    d = d.get(key, {})
                row.append(d if d != {} else None)
            if isinstance(row[-1], list):  # added value is a list
                # keep the largest multiplicity seen for this column
                mult[i] = max(mult[i], len(row[-1]))
        data.append(row)

    # Expand the table, duplicating multi-valued columns
    if max(mult) > 1:
        exp_header = []
        for h, m in zip(header, mult):
            exp_header.extend([h] * m)

        exp_data = []
        for row in data:
            r = []
            for m, v in zip(mult, row):
                vals = v if isinstance(v, list) else [v]
                # pad short lists so all rows stay aligned
                r.extend(vals + [None] * (m - len(vals)))
            exp_data.append(r)
        header, data = exp_header, exp_data

    return TableDoc(
        header=header,
        data=data,
        type=type,
        prefixes=prefixes,
        context=context,
        strip=strip,
    )

parse_csv(csvfile, type='Dataset', prefixes=None, context=None, encoding='utf-8', dialect=None, **kwargs) staticmethod

Parse a csv file using the standard library csv module.

Parameters:

    csvfile (Union[Iterable[str], Path, str], required):
        Name of CSV file to parse or an iterable of strings.
    type (Optional[str], default: "Dataset"):
        Type of data to save (applies to all rows). Should either be one of
        the pre-defined names: "Dataset", "Distribution", "AccessService",
        "Parser" and "Generator" or an IRI to a class in an ontology.
    prefixes (Optional[dict], default: None):
        Dict with prefixes in addition to those included in the JSON-LD
        context. Should map namespace prefixes to IRIs.
    context (Optional[Union[dict, list]], default: None):
        Dict with user-defined JSON-LD context.
    encoding (str, default: "utf-8"):
        The encoding of the csv file. Note that Excel may encode as
        "ISO-8859" (which was commonly used in the 1990s).
    dialect (Optional[Union[csv.Dialect, str]], default: None):
        A subclass of csv.Dialect, or the name of the dialect, specifying
        how the `csvfile` is formatted. For more details, see
        [Dialects and Formatting Parameters].
    kwargs (optional keyword arguments):
        Additional keyword arguments overriding individual formatting
        parameters. For more details, see
        [Dialects and Formatting Parameters].

Returns:

    TableDoc: New TableDoc instance.
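
A short sketch with made-up rows: since dialect sniffing reads from and
rewinds the stream, pass an explicit dialect (or formatting kwargs) when
`csvfile` is a plain iterable of strings rather than an open file:

lines = [
    "@id,title,distribution.downloadURL",
    "ex:data1,First dataset,http://example.com/data1.csv",
]
td = TableDoc.parse_csv(
    lines, dialect="excel", prefixes={"ex": "http://example.com/ns#"}
)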

References:

    [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
Source code in tripper/datadoc/tabledoc.py
@staticmethod
def parse_csv(
    csvfile: "Union[Iterable[str], Path, str]",
    type: "Optional[str]" = "Dataset",
    prefixes: "Optional[dict]" = None,
    context: "Optional[Union[dict, list]]" = None,
    encoding: str = "utf-8",
    dialect: "Optional[Union[csv.Dialect, str]]" = None,
    **kwargs,
) -> "TableDoc":
    # pylint: disable=line-too-long
    """Parse a csv file using the standard library csv module.

    Arguments:
        csvfile: Name of CSV file to parse or an iterable of strings.
        type: Type of data to save (applies to all rows).  Should
            either be one of the pre-defined names: "Dataset",
            "Distribution", "AccessService", "Parser" and "Generator"
            or an IRI to a class in an ontology.  Defaults to
            "Dataset".
        prefixes: Dict with prefixes in addition to those included in the
            JSON-LD context.  Should map namespace prefixes to IRIs.
        context: Dict with user-defined JSON-LD context.
        encoding: The encoding of the csv file.  Note that Excel may
            encode as "ISO-8859" (which was commonly used in the 1990s).
        dialect: A subclass of csv.Dialect, or the name of the dialect,
            specifying how the `csvfile` is formatted.  For more details,
            see [Dialects and Formatting Parameters].
        kwargs: Additional keyword arguments overriding individual
            formatting parameters.  For more details, see
            [Dialects and Formatting Parameters].

    Returns:
        New TableDoc instance.

    References:
    [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
    """

    def read(f, dialect):
        """Return csv reader from file-like object `f`."""
        if dialect is None and not kwargs:
            sample = f.read(1024)
            try:
                dialect = csv.Sniffer().sniff(sample, delimiters=",;\t ")
            except csv.Error:
                # The built-in sniffer does not always work well with
                # non-numerical csv files.  Try our simple sniffer.
                dialect = csvsniff(sample)
            finally:
                f.seek(0)
        reader = csv.reader(f, dialect=dialect, **kwargs)
        header = next(reader)
        data = list(reader)
        return header, data

    if isinstance(csvfile, (str, Path)):
        with openfile(csvfile, mode="rt", encoding=encoding) as f:
            header, data = read(f, dialect)
    else:
        header, data = read(csvfile, dialect)

    return TableDoc(
        header=header,
        data=data,
        type=type,
        prefixes=prefixes,
        context=context,
    )

save(self, ts)

Save tabular data documentation to a triplestore.
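
A minimal sketch, assuming the rdflib backend of tripper's Triplestore is
available (serialize() is shown only to inspect the result):

from tripper import Triplestore

ts = Triplestore(backend="rdflib")  # in-memory triplestore
td.save(ts)                         # td is a TableDoc instance
print(ts.serialize())               # dump the stored triples as Turtle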

Source code in tripper/datadoc/tabledoc.py
def save(self, ts: Triplestore) -> None:
    """Save tabular datadocumentation to triplestore."""
    ## self.prefixes.update(ts.namespaces)

    self.context.add_context(
        {prefix: str(ns) for prefix, ns in ts.namespaces.items()}
    )

    for prefix, ns in self.context.get_prefixes().items():
        ts.bind(prefix, ns)

    store(ts, self.asdicts(), type=self.type, context=self.context)

write_csv(self, csvfile, encoding='utf-8', dialect='excel', prefixes=None, **kwargs)

Write the table to a csv file using the standard library csv module.

Parameters:

    csvfile (Union[Path, str, Writer], required):
        File-like object or name of CSV file to write.
    encoding (str, default: "utf-8"):
        The encoding of the csv file.
    dialect (Union[csv.Dialect, str], default: "excel"):
        A subclass of csv.Dialect, or the name of the dialect, specifying
        how the `csvfile` is formatted. For more details, see
        [Dialects and Formatting Parameters].
    prefixes (Optional[dict], default: None):
        Prefixes used to compact the header.
    kwargs (optional keyword arguments):
        Additional keyword arguments overriding individual formatting
        parameters. For more details, see
        [Dialects and Formatting Parameters].
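
For instance, writing to an in-memory buffer (the dcat prefix is
illustrative; compaction only applies to headers that start with a full
namespace IRI):

import io

buf = io.StringIO()
td.write_csv(buf, prefixes={"dcat": "http://www.w3.org/ns/dcat#"})
print(buf.getvalue())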

References:

    [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
Source code in tripper/datadoc/tabledoc.py
def write_csv(
    self,
    csvfile: "Union[Path, str, Writer]",
    encoding: str = "utf-8",
    dialect: "Union[csv.Dialect, str]" = "excel",
    prefixes: "Optional[dict]" = None,
    **kwargs,
) -> None:
    # pylint: disable=line-too-long
    """Write the table to a csv file using the standard library csv module.

    Arguments:
        csvfile: File-like object or name of CSV file to write.
        encoding: The encoding of the csv file.
        dialect: A subclass of csv.Dialect, or the name of the dialect,
            specifying how the `csvfile` is formatted.  For more details,
            see [Dialects and Formatting Parameters].
        prefixes: Prefixes used to compact the header.
        kwargs: Additional keyword arguments overriding individual
            formatting parameters.  For more details, see
            [Dialects and Formatting Parameters].

    References:
    [Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
    """

    def write(f):
        writer = csv.writer(f, dialect=dialect, **kwargs)

        # TODO: use self.context and compact to shortnames
        if prefixes:
            header = []
            for h in self.header:
                for prefix, ns in prefixes.items():
                    if h.startswith(str(ns)):
                        header.append(f"{prefix}:{h[len(str(ns)):]}")
                        break
                else:
                    header.append(h)
            writer.writerow(header)
        else:
            writer.writerow(self.header)

        for row in self.data:
            writer.writerow(row)

    if isinstance(csvfile, (str, Path)):
        with open(csvfile, mode="wt", encoding=encoding) as f:
            write(f)
    else:
        write(csvfile)

csvsniff(sample)

Custom csv sniffer.

Analyse a csv sample and return a csv.Dialect instance.
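
A quick sketch with a made-up sample, assuming csvsniff is importable from
tripper.datadoc.tabledoc:

import csv
import io

sample = 'a;b;c\r\n1;"x;y";3\r\n4;5;6\r\n'
dialect = csvsniff(sample)  # infers ";" as delimiter, '"' as quote char
rows = list(csv.reader(io.StringIO(sample), dialect=dialect))
# rows == [['a', 'b', 'c'], ['1', 'x;y', '3'], ['4', '5', '6']]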

Source code in tripper/datadoc/tabledoc.py
def csvsniff(sample):
    """Custom csv sniffer.

    Analyse a csv sample and return a csv.Dialect instance.
    """
    # Determine line terminator
    if "\r\n" in sample:
        linesep = "\r\n"
    else:
        counts = {s: sample.count(s) for s in "\n\r"}
        linesep = max(counts, key=lambda k: counts[k])

    lines = sample.split(linesep)
    del lines[-1]  # skip last line since it might be truncated
    if not lines:
        raise csv.Error(
            "too long csv header. No line terminator within sample"
        )
    header = lines[0]

    # Possible delimiters and quote chars to check
    delims = [d for d in ",;\t :" if header.count(d)]
    quotes = [q for q in "\"'" if sample.count(q)]
    if not quotes:
        quotes = ['"']

    # For each (quote, delim)-pair, count the number of tokens per line.
    # Only pairs for which all lines have the same number of tokens are
    # added to ntokens.
    ntokens = {}  # map (quote, delim) to number of tokens per line
    for q in quotes:
        for d in delims:
            ntok = []
            for ln in lines:
                # Remove quoted tokens
                ln = re.sub(f"(^{q}[^{q}]*{q}{d})|({d}{q}[^{q}]*{q}$)", d, ln)
                ln = re.sub(f"{d}{q}[^{q}]*{q}{d}", d * 2, ln)
                ntok.append(len(ln.split(d)))

            if ntok and max(ntok) == min(ntok):
                ntokens[(q, d)] = ntok[0]

    # From ntokens, select (quote, delim) pair that results in the highest
    # number of tokens per line
    if not ntokens:
        raise csv.Error("not able to determine delimiter")
    quote, delim = max(ntokens, key=lambda k: ntokens[k])

    class dialect(csv.Dialect):
        """Custom dialect."""

        # pylint: disable=too-few-public-methods
        _name = "sniffed"
        delimiter = delim
        doublequote = True  # quote chars inside quotes are duplicated
        # escapechar = "\\"  # unused
        lineterminator = linesep
        quotechar = quote
        quoting = csv.QUOTE_MINIMAL
        skipinitialspace = False  # don't ignore spaces before a delimiter
        strict = False  # be permissive on malformed csv input

    return dialect