Skip to content

excel_xlsx

Strategy class for workbook/xlsx.

SessionUpdateXLSXParse

Bases: SessionUpdate

Class for returning values from XLSXParse.

Source code in oteapi/strategies/parse/excel_xlsx.py
19
20
21
22
23
24
25
class SessionUpdateXLSXParse(SessionUpdate):
    """Class for returning values from XLSXParse."""

    data: Dict[str, list] = Field(
        ...,
        description="A dict with column-name/column-value pairs. The values are lists.",
    )

XLSXParseConfig

Bases: AttrDict

Data model for retrieving a rectangular section of an Excel sheet.

Source code in oteapi/strategies/parse/excel_xlsx.py
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
class XLSXParseConfig(AttrDict):
    """Data model for retrieving a rectangular section of an Excel sheet."""

    worksheet: str = Field(..., description="Name of worksheet to load.")
    row_from: Optional[int] = Field(
        None,
        description="Excel row number of first row. Defaults to first assigned row.",
    )
    col_from: Optional[Union[int, str]] = Field(
        None,
        description=(
            "Excel column number or label of first column. Defaults to first assigned "
            "column."
        ),
    )
    row_to: Optional[int] = Field(
        None, description="Excel row number of last row. Defaults to last assigned row."
    )
    col_to: Optional[Union[int, str]] = Field(
        None,
        description=(
            "Excel column number or label of last column. Defaults to last assigned "
            "column."
        ),
    )
    header_row: Optional[int] = Field(
        None,
        description=(
            "Row number with the headers. Defaults to `1` if header is given, "
            "otherwise `None`."
        ),
    )
    header: Optional[List[str]] = Field(
        None,
        description=(
            "Optional list of column names, specifying the columns to return. "
            "These names they should match cells in `header_row`."
        ),
    )
    new_header: Optional[List[str]] = Field(
        None,
        description=(
            "Optional list of new column names replacing `header` in the output."
        ),
    )
    download_config: AttrDict = Field(
        AttrDict(),
        description="Configurations provided to a download strategy.",
    )
    datacache_config: Optional[DataCacheConfig] = Field(
        None,
        description="Configurations for the data cache for retrieving the downloaded content.",
    )

XLSXParseResourceConfig

Bases: ResourceConfig

XLSX parse strategy resource config.

Source code in oteapi/strategies/parse/excel_xlsx.py
83
84
85
86
87
88
89
90
91
92
93
class XLSXParseResourceConfig(ResourceConfig):
    """XLSX parse strategy resource config."""

    mediaType: str = Field(
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
        const=True,
        description=ResourceConfig.__fields__["mediaType"].field_info.description,
    )
    configuration: XLSXParseConfig = Field(
        ..., description="SQLite parse strategy-specific configuration."
    )

XLSXParseStrategy

Parse strategy for Excel XLSX files.

Registers strategies:

  • ("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
Source code in oteapi/strategies/parse/excel_xlsx.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
@dataclass
class XLSXParseStrategy:
    """Parse strategy for Excel XLSX files.

    **Registers strategies**:

    - `("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")`

    """

    parse_config: XLSXParseResourceConfig

    def initialize(self, session: "Optional[Dict[str, Any]]" = None) -> SessionUpdate:
        """Initialize."""
        return SessionUpdate()

    def get(self, session: "Optional[Dict[str, Any]]" = None) -> SessionUpdateXLSXParse:
        """Parses selected region of an excel file.

        Returns:
            A dict with column-name/column-value pairs. The values are lists.

        """

        cache = DataCache(self.parse_config.configuration.datacache_config)
        if session is None:
            raise ValueError("Missing session")
        with cache.getfile(key=session["key"], suffix=".xlsx") as filename:
            # Note that we have to set read_only=False to ensure that
            # load_workbook() properly closes the xlsx file after reading.
            # Otherwise Windows will fail when the temporary file is removed
            # when leaving the with statement.
            workbook = load_workbook(filename=filename, read_only=False, data_only=True)

        worksheet = workbook[self.parse_config.configuration.worksheet]
        set_model_defaults(self.parse_config.configuration, worksheet)
        columns = get_column_indices(self.parse_config.configuration, worksheet)

        data = []
        for row in worksheet.iter_rows(
            min_row=self.parse_config.configuration.row_from,
            max_row=self.parse_config.configuration.row_to,
            min_col=min(columns),
            max_col=max(columns),
        ):
            data.append([row[c - 1].value for c in columns])

        if self.parse_config.configuration.header_row:
            row = worksheet.iter_rows(
                min_row=self.parse_config.configuration.header_row,
                max_row=self.parse_config.configuration.header_row,
                min_col=min(columns),
                max_col=max(columns),
            ).__next__()
            header = [row[c - 1].value for c in columns]
        else:
            header = None

        if self.parse_config.configuration.new_header:
            nhead = len(header) if header else len(data[0]) if data else 0
            if len(self.parse_config.configuration.new_header) != nhead:
                raise TypeError(
                    f"length of `new_header` (={len(self.parse_config.configuration.new_header)}) "
                    f"doesn't match number of columns (={len(header) if header else 0})"
                )
            if header:
                for i, val in enumerate(self.parse_config.configuration.new_header):
                    if val is not None:
                        header[i] = val
            elif data:
                header = self.parse_config.configuration.new_header

        if header is None:
            header = [get_column_letter(col + 1) for col in range(len(data))]

        transposed = [list(datum) for datum in zip(*data)]
        return SessionUpdateXLSXParse(
            data={key: value for key, value in zip(header, transposed)}
        )

get(session=None)

Parses selected region of an excel file.

Returns:

Type Description
SessionUpdateXLSXParse

A dict with column-name/column-value pairs. The values are lists.

Source code in oteapi/strategies/parse/excel_xlsx.py
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
def get(self, session: "Optional[Dict[str, Any]]" = None) -> SessionUpdateXLSXParse:
    """Parses selected region of an excel file.

    Returns:
        A dict with column-name/column-value pairs. The values are lists.

    """

    cache = DataCache(self.parse_config.configuration.datacache_config)
    if session is None:
        raise ValueError("Missing session")
    with cache.getfile(key=session["key"], suffix=".xlsx") as filename:
        # Note that we have to set read_only=False to ensure that
        # load_workbook() properly closes the xlsx file after reading.
        # Otherwise Windows will fail when the temporary file is removed
        # when leaving the with statement.
        workbook = load_workbook(filename=filename, read_only=False, data_only=True)

    worksheet = workbook[self.parse_config.configuration.worksheet]
    set_model_defaults(self.parse_config.configuration, worksheet)
    columns = get_column_indices(self.parse_config.configuration, worksheet)

    data = []
    for row in worksheet.iter_rows(
        min_row=self.parse_config.configuration.row_from,
        max_row=self.parse_config.configuration.row_to,
        min_col=min(columns),
        max_col=max(columns),
    ):
        data.append([row[c - 1].value for c in columns])

    if self.parse_config.configuration.header_row:
        row = worksheet.iter_rows(
            min_row=self.parse_config.configuration.header_row,
            max_row=self.parse_config.configuration.header_row,
            min_col=min(columns),
            max_col=max(columns),
        ).__next__()
        header = [row[c - 1].value for c in columns]
    else:
        header = None

    if self.parse_config.configuration.new_header:
        nhead = len(header) if header else len(data[0]) if data else 0
        if len(self.parse_config.configuration.new_header) != nhead:
            raise TypeError(
                f"length of `new_header` (={len(self.parse_config.configuration.new_header)}) "
                f"doesn't match number of columns (={len(header) if header else 0})"
            )
        if header:
            for i, val in enumerate(self.parse_config.configuration.new_header):
                if val is not None:
                    header[i] = val
        elif data:
            header = self.parse_config.configuration.new_header

    if header is None:
        header = [get_column_letter(col + 1) for col in range(len(data))]

    transposed = [list(datum) for datum in zip(*data)]
    return SessionUpdateXLSXParse(
        data={key: value for key, value in zip(header, transposed)}
    )

initialize(session=None)

Initialize.

Source code in oteapi/strategies/parse/excel_xlsx.py
165
166
167
def initialize(self, session: "Optional[Dict[str, Any]]" = None) -> SessionUpdate:
    """Initialize."""
    return SessionUpdate()

get_column_indices(model, worksheet)

Helper function returning a list of column indices.

Parameters:

Name Type Description Default
model XLSXParseConfig

The parsed data model.

required
worksheet Worksheet

Excel worksheet, from which the header values will be retrieved.

required

Returns:

Type Description
Iterable[int]

A list of column indices.

Source code in oteapi/strategies/parse/excel_xlsx.py
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
def get_column_indices(
    model: XLSXParseConfig, worksheet: "Worksheet"
) -> "Iterable[int]":
    """Helper function returning a list of column indices.

    Parameters:
        model: The parsed data model.
        worksheet: Excel worksheet, from which the header values will be retrieved.

    Returns:
        A list of column indices.

    """
    if not isinstance(model.col_from, int) or not isinstance(model.col_to, int):
        raise TypeError("Expected `model.col_from` and `model.col_to` to be integers.")

    if model.header:
        header_dict = {
            worksheet.cell(model.header_row, col).value: col
            for col in range(model.col_from, model.col_to + 1)
        }
        return [header_dict[h] for h in model.header]
    return range(model.col_from, model.col_to + 1)

set_model_defaults(model, worksheet)

Update data model model with default values obtained from worksheet.

Parameters:

Name Type Description Default
model XLSXParseConfig

The parsed data model.

required
worksheet Worksheet

Excel worksheet, from which the default values will be obtained.

required
Source code in oteapi/strategies/parse/excel_xlsx.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
def set_model_defaults(model: XLSXParseConfig, worksheet: "Worksheet") -> None:
    """Update data model `model` with default values obtained from `worksheet`.

    Parameters:
        model: The parsed data model.
        worksheet: Excel worksheet, from which the default values will be obtained.

    """
    if model.row_from is None:
        if model.header:
            # assume that data starts on the first row after the header
            model.row_from = model.header_row + 1 if model.header_row else 1
        else:
            model.row_from = worksheet.min_row

    if model.row_to is None:
        model.row_to = worksheet.max_row

    if model.col_from is None:
        model.col_from = worksheet.min_column
    elif isinstance(model.col_from, str):
        model.col_from = column_index_from_string(model.col_from)

    if model.col_to is None:
        model.col_to = worksheet.max_column
    elif isinstance(model.col_to, str):
        model.col_to = column_index_from_string(model.col_to)

    if model.header and not model.header_row:
        model.header_row = 1