tabledoc¶
Basic interface for tabular documentation of datasets.
TableDoc
¶
Representation of tabular documentation of datasets.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
header | Sequence[str] | Sequence of column header labels. Nested data can be represented by dot-separated label strings (e.g. "distribution.downloadURL"). | required |
data | Sequence[Sequence[str]] | Sequence of rows of data. Each row documents an entry. | required |
type | Optional[str] | Type of data to save (applies to all rows). Should either be one of the pre-defined names: "dataset", "distribution", "accessService", "parser" and "generator", or an IRI of a class in an ontology. Defaults to "dataset". | 'dataset' |
prefixes | Optional[dict] | Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. | None |
context | Optional[Union[str, dict, list]] | Additional user-defined context that should be returned on top of the default context. It may be a string with a URL to the user-defined context, a dict with the user-defined context, or a sequence of strings and dicts. | None |
strip | bool | Whether to strip leading and trailing whitespace from cells. | True |
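A minimal usage sketch. The column labels "@id" and "description" and the "ex" prefix are illustrative assumptions; "distribution.downloadURL" shows how nested properties are expressed with dot-separated column labels, as described above.

```python
from tripper.dataset.tabledoc import TableDoc

# Two datasets documented as one row each.
td = TableDoc(
    header=["@id", "description", "distribution.downloadURL"],
    data=[
        ["ex:data1", "First dataset", "https://example.com/data1.csv"],
        ["ex:data2", "Second dataset", "https://example.com/data2.csv"],
    ],
    prefixes={"ex": "https://example.com/ex#"},
)
```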
Source code in tripper/dataset/tabledoc.py
class TableDoc:
"""Representation of tabular documentation of datasets.
Arguments:
header: Sequence of column header labels. Nested data can
be represented by dot-separated label strings (e.g.
"distribution.downloadURL")
data: Sequence of rows of data. Each row documents an entry.
type: Type of data to save (applies to all rows). Should
either be one of the pre-defined names: "dataset",
"distribution", "accessService", "parser" and "generator"
or an IRI to a class in an ontology. Defaults to
"dataset".
prefixes: Dict with prefixes in addition to those included in the
JSON-LD context. Should map namespace prefixes to IRIs.
context: Additional user-defined context that should be
returned on top of the default context. It may be a
string with an URL to the user-defined context, a dict
with the user-defined context or a sequence of strings and
dicts.
strip: Whether to strip leading and trailing whitespaces from cells.
"""
# pylint: disable=redefined-builtin,too-few-public-methods
def __init__(
self,
header: "Sequence[str]",
data: "Sequence[Sequence[str]]",
type: "Optional[str]" = "dataset",
prefixes: "Optional[dict]" = None,
context: "Optional[Union[str, dict, list]]" = None,
strip: bool = True,
):
self.header = list(header)
self.data = [list(row) for row in data]
self.type = type
self.prefixes = prefixes
self.context = context
self.strip = strip
def save(self, ts: Triplestore) -> None:
"""Save tabular datadocumentation to triplestore."""
for d in self.asdicts():
save_dict(ts, d)
def asdicts(self) -> "List[dict]":
"""Return the table as a list of dicts."""
kw = {"_context": self.context} if self.context else {}
results = []
for row in self.data:
d = AttrDict()
for i, colname in enumerate(self.header):
cell = row[i].strip() if row[i] and self.strip else row[i]
if cell:
addnested(
d, colname.strip() if self.strip else colname, cell
)
jsonld = as_jsonld(
d, type=self.type, prefixes=self.prefixes, **kw # type: ignore
)
results.append(jsonld)
return results
@staticmethod
def fromdicts(
dicts: "Sequence[dict]",
type: "Optional[str]" = "dataset",
prefixes: "Optional[dict]" = None,
context: "Optional[Union[str, dict, list]]" = None,
strip: bool = True,
) -> "TableDoc":
"""Create new TableDoc instance from a sequence of dicts.
Arguments:
dicts: Sequence of single-resource dicts.
type: Type of data to save (applies to all rows). Should
either be one of the pre-defined names: "dataset",
"distribution", "accessService", "parser" and
"generator" or an IRI to a class in an ontology.
Defaults to "dataset".
prefixes: Dict with prefixes in addition to those included
in the JSON-LD context. Should map namespace prefixes
to IRIs.
context: Additional user-defined context that should be
returned on top of the default context. It may be a
string with an URL to the user-defined context, a dict
with the user-defined context or a sequence of strings
and dicts.
strip: Whether to strip leading and trailing whitespaces
from cells.
Returns:
New TableDoc instance.
"""
# Store the header as keys in a dict to keep ordering
header = {}
def addheader(d, prefix=""):
"""Add keys in `d` to header.
Nested dicts will result in dot-separated keys.
"""
for k, v in d.items():
if isinstance(v, dict):
addheader(v, k + ".")
else:
header[prefix + k] = True
# Assign the header
for d in dicts:
addheader(d)
# Assign table data. Nested dicts are accounted for
data = []
for dct in dicts:
row = []
for head in header:
d = dct
for key in head.split("."):
d = d.get(key, {})
row.append(d if d != {} else None)
data.append(row)
return TableDoc(
header=header.keys(), # type: ignore
data=data, # type: ignore
type=type,
prefixes=prefixes,
context=context,
strip=strip,
)
@staticmethod
def parse_csv(
csvfile: "Union[Iterable[str], Path, str]",
type: "Optional[str]" = "dataset",
prefixes: "Optional[dict]" = None,
context: "Optional[Union[dict, list]]" = None,
encoding: str = "utf-8",
dialect: "Optional[Union[csv.Dialect, str]]" = None,
**kwargs,
) -> "TableDoc":
# pylint: disable=line-too-long
"""Parse a csv file using the standard library csv module.
Arguments:
csvfile: Name of CSV file to parse or an iterable of strings.
type: Type of data to save (applies to all rows). Should
either be one of the pre-defined names: "dataset",
"distribution", "accessService", "parser" and "generator"
or an IRI to a class in an ontology. Defaults to
"dataset".
prefixes: Dict with prefixes in addition to those included in the
JSON-LD context. Should map namespace prefixes to IRIs.
context: Dict with user-defined JSON-LD context.
encoding: The encoding of the csv file. Note that Excel may
encode as "ISO-8859" (which was commonly used in the 1990th).
dialect: A subclass of csv.Dialect, or the name of the dialect,
specifying how the `csvfile` is formatted. For more details,
see [Dialects and Formatting Parameters].
kwargs: Additional keyword arguments overriding individual
formatting parameters. For more details, see
[Dialects and Formatting Parameters].
Returns:
New TableDoc instance.
References:
[Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
"""
def read(f, dialect):
"""Return csv reader from file-like object `f`."""
if dialect is None and not kwargs:
dialect = csv.Sniffer().sniff(f.read(1024), delimiters=",;\t ")
f.seek(0)
reader = csv.reader(f, dialect=dialect, **kwargs)
header = next(reader)
data = list(reader)
return header, data
if isinstance(csvfile, (str, Path)):
with openfile(csvfile, mode="rt", encoding=encoding) as f:
header, data = read(f, dialect)
else:
header, data = read(csvfile, dialect)
return TableDoc(
header=header,
data=data,
type=type,
prefixes=prefixes,
context=context,
)
def write_csv(
self,
csvfile: "Union[Path, str, Writer]",
encoding: str = "utf-8",
dialect: "Union[csv.Dialect, str]" = "excel",
**kwargs,
) -> None:
# pylint: disable=line-too-long
"""Write the table to a csv file using the standard library csv module.
Arguments:
csvfile: File-like object or name of CSV file to write.
encoding: The encoding of the csv file.
dialect: A subclass of csv.Dialect, or the name of the dialect,
specifying how the `csvfile` is formatted. For more details,
see [Dialects and Formatting Parameters].
kwargs: Additional keyword arguments overriding individual
formatting parameters. For more details, see
[Dialects and Formatting Parameters].
References:
[Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
"""
def write(f):
writer = csv.writer(f, dialect=dialect, **kwargs)
writer.writerow(self.header)
for row in self.data:
writer.writerow(row)
if isinstance(csvfile, (str, Path)):
with open(csvfile, mode="wt", encoding=encoding) as f:
write(f)
else:
write(csvfile)
asdicts(self)
¶
Return the table as a list of dicts.
Source code in tripper/dataset/tabledoc.py
def asdicts(self) -> "List[dict]":
"""Return the table as a list of dicts."""
kw = {"_context": self.context} if self.context else {}
results = []
for row in self.data:
d = AttrDict()
for i, colname in enumerate(self.header):
cell = row[i].strip() if row[i] and self.strip else row[i]
if cell:
addnested(
d, colname.strip() if self.strip else colname, cell
)
jsonld = as_jsonld(
d, type=self.type, prefixes=self.prefixes, **kw # type: ignore
)
results.append(jsonld)
return results
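A short sketch of calling asdicts(), assuming the td instance from the constructor example above. The exact keys of each returned dict depend on the default JSON-LD context and the as_jsonld() helper.

```python
# One JSON-LD dict per table row, with type and prefixes applied.
for d in td.asdicts():
    print(d)
```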
fromdicts(dicts, type='dataset', prefixes=None, context=None, strip=True)
staticmethod
¶
Create new TableDoc instance from a sequence of dicts.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
dicts | Sequence[dict] | Sequence of single-resource dicts. | required |
type | Optional[str] | Type of data to save (applies to all rows). Should either be one of the pre-defined names: "dataset", "distribution", "accessService", "parser" and "generator", or an IRI of a class in an ontology. Defaults to "dataset". | 'dataset' |
prefixes | Optional[dict] | Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. | None |
context | Optional[Union[str, dict, list]] | Additional user-defined context that should be returned on top of the default context. It may be a string with a URL to the user-defined context, a dict with the user-defined context, or a sequence of strings and dicts. | None |
strip | bool | Whether to strip leading and trailing whitespace from cells. | True |

Returns:

Type | Description |
---|---|
TableDoc | New TableDoc instance. |
Source code in tripper/dataset/tabledoc.py
@staticmethod
def fromdicts(
dicts: "Sequence[dict]",
type: "Optional[str]" = "dataset",
prefixes: "Optional[dict]" = None,
context: "Optional[Union[str, dict, list]]" = None,
strip: bool = True,
) -> "TableDoc":
"""Create new TableDoc instance from a sequence of dicts.
Arguments:
dicts: Sequence of single-resource dicts.
type: Type of data to save (applies to all rows). Should
either be one of the pre-defined names: "dataset",
"distribution", "accessService", "parser" and
"generator" or an IRI to a class in an ontology.
Defaults to "dataset".
prefixes: Dict with prefixes in addition to those included
in the JSON-LD context. Should map namespace prefixes
to IRIs.
context: Additional user-defined context that should be
returned on top of the default context. It may be a
string with an URL to the user-defined context, a dict
with the user-defined context or a sequence of strings
and dicts.
strip: Whether to strip leading and trailing whitespaces
from cells.
Returns:
New TableDoc instance.
"""
# Store the header as keys in a dict to keep ordering
header = {}
def addheader(d, prefix=""):
"""Add keys in `d` to header.
Nested dicts will result in dot-separated keys.
"""
for k, v in d.items():
if isinstance(v, dict):
addheader(v, k + ".")
else:
header[prefix + k] = True
# Assign the header
for d in dicts:
addheader(d)
# Assign table data. Nested dicts are accounted for
data = []
for dct in dicts:
row = []
for head in header:
d = dct
for key in head.split("."):
d = d.get(key, {})
row.append(d if d != {} else None)
data.append(row)
return TableDoc(
header=header.keys(), # type: ignore
data=data, # type: ignore
type=type,
prefixes=prefixes,
context=context,
strip=strip,
)
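A minimal sketch of building a table from dicts. The IRI and the "ex" prefix are illustrative assumptions; note how the nested "distribution" dict is flattened into a dot-separated column label.

```python
td = TableDoc.fromdicts(
    [
        {
            "@id": "ex:data1",
            "description": "First dataset",
            "distribution": {"downloadURL": "https://example.com/data1.csv"},
        },
    ],
    prefixes={"ex": "https://example.com/ex#"},
)
print(td.header)  # e.g. ['@id', 'description', 'distribution.downloadURL']
```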
parse_csv(csvfile, type='dataset', prefixes=None, context=None, encoding='utf-8', dialect=None, **kwargs)
staticmethod
¶
Parse a csv file using the standard library csv module.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
csvfile | Union[Iterable[str], Path, str] | Name of CSV file to parse or an iterable of strings. | required |
type | Optional[str] | Type of data to save (applies to all rows). Should either be one of the pre-defined names: "dataset", "distribution", "accessService", "parser" and "generator", or an IRI of a class in an ontology. Defaults to "dataset". | 'dataset' |
prefixes | Optional[dict] | Dict with prefixes in addition to those included in the JSON-LD context. Should map namespace prefixes to IRIs. | None |
context | Optional[Union[dict, list]] | Dict with user-defined JSON-LD context. | None |
encoding | str | The encoding of the csv file. Note that Excel may encode as "ISO-8859" (which was commonly used in the 1990s). | 'utf-8' |
dialect | Optional[Union[csv.Dialect, str]] | A subclass of csv.Dialect, or the name of the dialect, specifying how the csvfile is formatted. For more details, see [Dialects and Formatting Parameters]. | None |
kwargs | | Additional keyword arguments overriding individual formatting parameters. For more details, see [Dialects and Formatting Parameters]. | {} |

Returns:

Type | Description |
---|---|
TableDoc | New TableDoc instance. |
References:

[Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
Source code in tripper/dataset/tabledoc.py
@staticmethod
def parse_csv(
csvfile: "Union[Iterable[str], Path, str]",
type: "Optional[str]" = "dataset",
prefixes: "Optional[dict]" = None,
context: "Optional[Union[dict, list]]" = None,
encoding: str = "utf-8",
dialect: "Optional[Union[csv.Dialect, str]]" = None,
**kwargs,
) -> "TableDoc":
# pylint: disable=line-too-long
"""Parse a csv file using the standard library csv module.
Arguments:
csvfile: Name of CSV file to parse or an iterable of strings.
type: Type of data to save (applies to all rows). Should
either be one of the pre-defined names: "dataset",
"distribution", "accessService", "parser" and "generator"
or an IRI to a class in an ontology. Defaults to
"dataset".
prefixes: Dict with prefixes in addition to those included in the
JSON-LD context. Should map namespace prefixes to IRIs.
context: Dict with user-defined JSON-LD context.
encoding: The encoding of the csv file. Note that Excel may
encode as "ISO-8859" (which was commonly used in the 1990th).
dialect: A subclass of csv.Dialect, or the name of the dialect,
specifying how the `csvfile` is formatted. For more details,
see [Dialects and Formatting Parameters].
kwargs: Additional keyword arguments overriding individual
formatting parameters. For more details, see
[Dialects and Formatting Parameters].
Returns:
New TableDoc instance.
References:
[Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
"""
def read(f, dialect):
"""Return csv reader from file-like object `f`."""
if dialect is None and not kwargs:
dialect = csv.Sniffer().sniff(f.read(1024), delimiters=",;\t ")
f.seek(0)
reader = csv.reader(f, dialect=dialect, **kwargs)
header = next(reader)
data = list(reader)
return header, data
if isinstance(csvfile, (str, Path)):
with openfile(csvfile, mode="rt", encoding=encoding) as f:
header, data = read(f, dialect)
else:
header, data = read(csvfile, dialect)
return TableDoc(
header=header,
data=data,
type=type,
prefixes=prefixes,
context=context,
)
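A usage sketch, assuming a file datasets.csv whose header row matches the documentation columns (the filename is an assumption).

```python
# The dialect is sniffed from the beginning of the file when not given...
td = TableDoc.parse_csv("datasets.csv")

# ...or it can be stated explicitly by name.
td = TableDoc.parse_csv("datasets.csv", dialect="excel")
```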
save(self, ts)
¶
Save tabular data documentation to the triplestore.
Source code in tripper/dataset/tabledoc.py
def save(self, ts: Triplestore) -> None:
"""Save tabular datadocumentation to triplestore."""
for d in self.asdicts():
save_dict(ts, d)
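A sketch of saving the documentation to a triplestore, assuming the td instance from the examples above and that the rdflib backend is available.

```python
from tripper import Triplestore

ts = Triplestore(backend="rdflib")
td.save(ts)  # adds the JSON-LD documentation of each row to the triplestore
```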
write_csv(self, csvfile, encoding='utf-8', dialect='excel', **kwargs)
¶
Write the table to a csv file using the standard library csv module.
Parameters:

Name | Type | Description | Default |
---|---|---|---|
csvfile | Union[Path, str, Writer] | File-like object or name of CSV file to write. | required |
encoding | str | The encoding of the csv file. | 'utf-8' |
dialect | Union[csv.Dialect, str] | A subclass of csv.Dialect, or the name of the dialect, specifying how the csvfile is formatted. For more details, see [Dialects and Formatting Parameters]. | 'excel' |
kwargs | | Additional keyword arguments overriding individual formatting parameters. For more details, see [Dialects and Formatting Parameters]. | {} |
References:

[Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
Source code in tripper/dataset/tabledoc.py
def write_csv(
self,
csvfile: "Union[Path, str, Writer]",
encoding: str = "utf-8",
dialect: "Union[csv.Dialect, str]" = "excel",
**kwargs,
) -> None:
# pylint: disable=line-too-long
"""Write the table to a csv file using the standard library csv module.
Arguments:
csvfile: File-like object or name of CSV file to write.
encoding: The encoding of the csv file.
dialect: A subclass of csv.Dialect, or the name of the dialect,
specifying how the `csvfile` is formatted. For more details,
see [Dialects and Formatting Parameters].
kwargs: Additional keyword arguments overriding individual
formatting parameters. For more details, see
[Dialects and Formatting Parameters].
References:
[Dialects and Formatting Parameters]: https://docs.python.org/3/library/csv.html#dialects-and-formatting-parameters
"""
def write(f):
writer = csv.writer(f, dialect=dialect, **kwargs)
writer.writerow(self.header)
for row in self.data:
writer.writerow(row)
if isinstance(csvfile, (str, Path)):
with open(csvfile, mode="wt", encoding=encoding) as f:
write(f)
else:
write(csvfile)
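A sketch of writing the table back out; the output filenames are assumptions. The default "excel" dialect gives comma-separated values, while "excel-tab" is the standard library's tab-separated dialect.

```python
td.write_csv("datasets_out.csv")

# Tab-separated output using a named dialect from the csv module.
td.write_csv("datasets_tab.csv", dialect="excel-tab")
```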