excel_xlsx¶
Strategy class for workbook/xlsx.
XLSXParseDataModel (BaseModel)
pydantic-model
¶
Data model for retrieving a rectangular section of an Excel sheet.
Source code in oteapi/strategies/parse/excel_xlsx.py
class XLSXParseDataModel(BaseModel):
"""Data model for retrieving a rectangular section of an Excel sheet."""
worksheet: str = Field(..., description="Name of worksheet to load.")
row_from: Optional[int] = Field(
None,
description="Excel row number of first row. Defaults to first assigned row.",
)
col_from: Optional[Union[int, str]] = Field(
None,
description=(
"Excel column number or label of first column. Defaults to first assigned "
"column."
),
)
row_to: Optional[int] = Field(
None, description="Excel row number of last row. Defaults to last assigned row."
)
col_to: Optional[Union[int, str]] = Field(
None,
description=(
"Excel column number or label of last column. Defaults to last assigned "
"column."
),
)
header_row: Optional[int] = Field(
None,
description=(
"Row number with the headers. Defaults to `1` if header is given, "
"otherwise `None`."
),
)
header: Optional[List[str]] = Field(
None,
description=(
"Optional list of column names, specifying the columns to return. "
"These names they should match cells in `header_row`."
),
)
new_header: Optional[List[str]] = Field(
None,
description=(
"Optional list of new column names replacing `header` in the output."
),
)
col_from: Union[int, str]
pydantic-field
¶
Excel column number or label of first column. Defaults to first assigned column.
col_to: Union[int, str]
pydantic-field
¶
Excel column number or label of last column. Defaults to last assigned column.
header: List[str]
pydantic-field
¶
Optional list of column names, specifying the columns to return. These names they should match cells in header_row
.
header_row: int
pydantic-field
¶
Row number with the headers. Defaults to 1
if header is given, otherwise None
.
new_header: List[str]
pydantic-field
¶
Optional list of new column names replacing header
in the output.
row_from: int
pydantic-field
¶
Excel row number of first row. Defaults to first assigned row.
row_to: int
pydantic-field
¶
Excel row number of last row. Defaults to last assigned row.
worksheet: str
pydantic-field
required
¶
Name of worksheet to load.
XLSXParseStrategy
dataclass
¶
Parse strategy for Excel XLSX files.
Registers strategies:
("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
Source code in oteapi/strategies/parse/excel_xlsx.py
@dataclass
@StrategyFactory.register(
("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
)
class XLSXParseStrategy:
"""Parse strategy for Excel XLSX files.
**Registers strategies**:
- `("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")`
"""
resource_config: "ResourceConfig"
def initialize(
self, session: "Optional[Dict[str, Any]]" = None
) -> "Dict[str, Any]":
"""Initialize."""
return {}
def parse(self, session: "Optional[Dict[str, Any]]" = None) -> "Dict[str, Any]":
"""Parses selected region of an excel file.
Returns:
A dict with column-name/column-value pairs. The values are lists.
"""
model = XLSXParseDataModel(
**self.resource_config.configuration, extra=Extra.ignore
)
downloader = create_download_strategy(self.resource_config)
output = downloader.get()
cache = DataCache(self.resource_config.configuration)
with cache.getfile(key=output["key"], suffix=".xlsx") as filename:
workbook = load_workbook(filename=filename, read_only=True, data_only=True)
worksheet = workbook[model.worksheet]
set_model_defaults(model, worksheet)
columns = get_column_indices(model, worksheet)
data = []
for row in worksheet.iter_rows(
min_row=model.row_from,
max_row=model.row_to,
min_col=min(columns),
max_col=max(columns),
):
data.append([row[c - 1].value for c in columns])
if model.header_row:
row = worksheet.iter_rows(
min_row=model.header_row,
max_row=model.header_row,
min_col=min(columns),
max_col=max(columns),
).__next__()
header = [row[c - 1].value for c in columns]
else:
header = None
if model.new_header:
nhead = len(header) if header else len(data[0]) if data else 0
if len(model.new_header) != nhead:
raise TypeError(
f"length of `new_header` (={len(model.new_header)}) "
f"doesn't match number of columns (={len(header) if header else 0})"
)
if header:
for i, val in enumerate(model.new_header):
if val is not None:
header[i] = val
elif data:
header = model.new_header
if header is None:
header = [get_column_letter(col + 1) for col in range(len(data))]
transposed = list(map(list, zip(*data)))
return {k: v for k, v in zip(header, transposed)}
initialize(self, session=None)
¶
Initialize.
Source code in oteapi/strategies/parse/excel_xlsx.py
def initialize(
self, session: "Optional[Dict[str, Any]]" = None
) -> "Dict[str, Any]":
"""Initialize."""
return {}
parse(self, session=None)
¶
Parses selected region of an excel file.
Returns:
Type | Description |
---|---|
Dict[str, Any] |
A dict with column-name/column-value pairs. The values are lists. |
Source code in oteapi/strategies/parse/excel_xlsx.py
def parse(self, session: "Optional[Dict[str, Any]]" = None) -> "Dict[str, Any]":
"""Parses selected region of an excel file.
Returns:
A dict with column-name/column-value pairs. The values are lists.
"""
model = XLSXParseDataModel(
**self.resource_config.configuration, extra=Extra.ignore
)
downloader = create_download_strategy(self.resource_config)
output = downloader.get()
cache = DataCache(self.resource_config.configuration)
with cache.getfile(key=output["key"], suffix=".xlsx") as filename:
workbook = load_workbook(filename=filename, read_only=True, data_only=True)
worksheet = workbook[model.worksheet]
set_model_defaults(model, worksheet)
columns = get_column_indices(model, worksheet)
data = []
for row in worksheet.iter_rows(
min_row=model.row_from,
max_row=model.row_to,
min_col=min(columns),
max_col=max(columns),
):
data.append([row[c - 1].value for c in columns])
if model.header_row:
row = worksheet.iter_rows(
min_row=model.header_row,
max_row=model.header_row,
min_col=min(columns),
max_col=max(columns),
).__next__()
header = [row[c - 1].value for c in columns]
else:
header = None
if model.new_header:
nhead = len(header) if header else len(data[0]) if data else 0
if len(model.new_header) != nhead:
raise TypeError(
f"length of `new_header` (={len(model.new_header)}) "
f"doesn't match number of columns (={len(header) if header else 0})"
)
if header:
for i, val in enumerate(model.new_header):
if val is not None:
header[i] = val
elif data:
header = model.new_header
if header is None:
header = [get_column_letter(col + 1) for col in range(len(data))]
transposed = list(map(list, zip(*data)))
return {k: v for k, v in zip(header, transposed)}
get_column_indices(model, worksheet)
¶
Helper function returning a list of column indices.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
XLSXParseDataModel |
The parsed data model. |
required |
worksheet |
Worksheet |
Excel worksheet, from which the header values will be retrieved. |
required |
Returns:
Type | Description |
---|---|
Iterable[int] |
A list of column indices. |
Source code in oteapi/strategies/parse/excel_xlsx.py
def get_column_indices(
model: XLSXParseDataModel, worksheet: "Worksheet"
) -> "Iterable[int]":
"""Helper function returning a list of column indices.
Parameters:
model: The parsed data model.
worksheet: Excel worksheet, from which the header values will be retrieved.
Returns:
A list of column indices.
"""
if not isinstance(model.col_from, int) or not isinstance(model.col_to, int):
raise TypeError("Expected `model.col_from` and `model.col_to` to be integers.")
if model.header:
header_dict = {
worksheet.cell(model.header_row, col).value: col
for col in range(model.col_from, model.col_to + 1)
}
return [header_dict[h] for h in model.header]
return range(model.col_from, model.col_to + 1)
set_model_defaults(model, worksheet)
¶
Update data model model
with default values obtained from worksheet
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
model |
XLSXParseDataModel |
The parsed data model. |
required |
worksheet |
Worksheet |
Excel worksheet, from which the default values will be obtained. |
required |
Source code in oteapi/strategies/parse/excel_xlsx.py
def set_model_defaults(model: XLSXParseDataModel, worksheet: "Worksheet") -> None:
"""Update data model `model` with default values obtained from `worksheet`.
Parameters:
model: The parsed data model.
worksheet: Excel worksheet, from which the default values will be obtained.
"""
if model.row_from is None:
if model.header:
# assume that data starts on the first row after the header
model.row_from = model.header_row + 1 if model.header_row else 1
else:
model.row_from = worksheet.min_row
if model.row_to is None:
model.row_to = worksheet.max_row
if model.col_from is None:
model.col_from = worksheet.min_column
elif isinstance(model.col_from, str):
model.col_from = column_index_from_string(model.col_from)
if model.col_to is None:
model.col_to = worksheet.max_column
elif isinstance(model.col_to, str):
model.col_to = column_index_from_string(model.col_to)
if model.header and not model.header_row:
model.header_row = 1