Skip to content

parse_excel

Strategy for parsing an Excel spreadsheet to a DLite instance.

DLiteExcelParseConfig

Bases: DLiteResult

Configuration for DLite Excel parser.

Source code in oteapi_dlite/strategies/parse_excel.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class DLiteExcelParseConfig(DLiteResult):
    """Configuration for DLite Excel parser.

    Groups the resource settings (`downloadUrl`, `mediaType`) that are
    forwarded to the underlying xlsx parse strategy with the DLite-specific
    settings (`id`, `label`, `excel_config`, `storage_path`) that control
    how the new instance is created and stored.
    """

    # Resource config
    # Optional in the model, but `DLiteExcelStrategy.get()` raises
    # `ValueError` when it is `None`. The description is reused from
    # `ResourceConfig`.
    downloadUrl: Annotated[
        Optional[HostlessAnyUrl],
        Field(
            description=ResourceConfig.model_fields["downloadUrl"].description
        ),
    ] = None

    # Restricted to the single xlsx media type, which is also the default.
    mediaType: Annotated[
        Literal[
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        ],
        Field(description=ResourceConfig.model_fields["mediaType"].description),
    ] = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"

    # Parser config
    # Passed as `id=` when the new instance is created in `get()`.
    id: Annotated[
        Optional[str], Field(description="Optional id on new instance.")
    ] = None

    # Label under which `get()` adds the new instance to the collection.
    label: Annotated[
        Optional[str],
        Field(
            description="Optional label for new instance in collection.",
        ),
    ] = "excel-data"

    # Required (no default). Dumped with `model_dump()` and used as the
    # configuration of the "parser/excel_xlsx" strategy in `get()`.
    excel_config: Annotated[
        XLSXParseConfig,
        Field(
            description="DLite-specific excel configurations.",
        ),
    ]
    # "|"-separated list of paths; each one is appended to
    # `dlite.storage_path` in `get()` when an `entity` URI is given.
    storage_path: Annotated[
        Optional[str],
        Field(
            description="Path to metadata storage",
        ),
    ] = None

downloadUrl: Annotated[Optional[HostlessAnyUrl], Field(description=ResourceConfig.model_fields['downloadUrl'].description)] = None class-attribute instance-attribute

excel_config: Annotated[XLSXParseConfig, Field(description='DLite-specific excel configurations.')] instance-attribute

id: Annotated[Optional[str], Field(description='Optional id on new instance.')] = None class-attribute instance-attribute

label: Annotated[Optional[str], Field(description='Optional label for new instance in collection.')] = 'excel-data' class-attribute instance-attribute

mediaType: Annotated[Literal['application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'], Field(description=ResourceConfig.model_fields['mediaType'].description)] = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet' class-attribute instance-attribute

storage_path: Annotated[Optional[str], Field(description='Path to metadata storage')] = None class-attribute instance-attribute

DLiteExcelParserConfig

Bases: ParserConfig

DLite excel parse strategy resource config.

Source code in oteapi_dlite/strategies/parse_excel.py
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
class DLiteExcelParserConfig(ParserConfig):
    """DLite excel parse strategy parser config.

    Top-level configuration handed to `DLiteExcelStrategy` as
    `parse_config`.
    """

    # Fixed to the DLite xlsx parser type, which is also the default.
    parserType: Annotated[
        Literal["application/vnd.dlite-xlsx"],
        Field(description=ParserConfig.model_fields["parserType"].description),
    ] = "application/vnd.dlite-xlsx"
    # Required (no default): the strategy-specific settings read by
    # `DLiteExcelStrategy.initialize()` and `DLiteExcelStrategy.get()`.
    configuration: Annotated[
        DLiteExcelParseConfig,
        Field(description="DLite excel parse strategy-specific configuration."),
    ]
    # When set, `get()` loads the metadata with `dlite.get_instance()`;
    # when `None`, the metadata is inferred with `infer_metadata()`.
    entity: Annotated[
        Optional[AnyHttpUrl],
        Field(
            description=(
                "URI of DLite metadata to return. If not provided, the "
                "metadata will be inferred from the excel file."
            ),
        ),
    ] = None

configuration: Annotated[DLiteExcelParseConfig, Field(description='DLite excel parse strategy-specific configuration.')] instance-attribute

entity: Annotated[Optional[AnyHttpUrl], Field(description='URI of DLite metadata to return. If not provided, the metadata will be inferred from the excel file.')] = None class-attribute instance-attribute

parserType: Annotated[Literal['application/vnd.dlite-xlsx'], Field(description=ParserConfig.model_fields['parserType'].description)] = 'application/vnd.dlite-xlsx' class-attribute instance-attribute

DLiteExcelSessionUpdate

Bases: DLiteResult

Class for returning values from DLite excel parser.

Source code in oteapi_dlite/strategies/parse_excel.py
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
class DLiteExcelSessionUpdate(DLiteResult):
    """Class for returning values from DLite excel parser.

    Returned by `DLiteExcelStrategy.get()`, which also passes a
    `collection_id` (presumably a field inherited from `DLiteResult` --
    confirm against its definition).
    """

    # Required: set from `inst.uuid` of the instance created in `get()`.
    inst_uuid: Annotated[
        str,
        Field(
            description="UUID of new instance.",
        ),
    ]
    # Required: set from `config.label`, the label used in `coll.add()`.
    label: Annotated[
        str,
        Field(
            description="Label of the new instance in the collection.",
        ),
    ]

inst_uuid: Annotated[str, Field(description='UUID of new instance.')] instance-attribute

label: Annotated[str, Field(description='Label of the new instance in the collection.')] instance-attribute

DLiteExcelStrategy

Parse strategy for Excel files.

Registers strategies:

  • ("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
Source code in oteapi_dlite/strategies/parse_excel.py
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
@dataclass
class DLiteExcelStrategy:
    """Parse strategy for Excel files.

    **Registers strategies**:

    - `("mediaType",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")`

    """

    # NOTE(review): the docstring lists a "mediaType" registration, while
    # `DLiteExcelParserConfig.parserType` is "application/vnd.dlite-xlsx" --
    # confirm which key the strategy is actually registered under.
    parse_config: DLiteExcelParserConfig

    def initialize(self) -> DLiteResult:
        """Initialize.

        Returns:
            Result carrying the UUID of the collection obtained from
            `get_collection()` for the configured `collection_id`.

        """
        return DLiteResult(
            collection_id=get_collection(
                self.parse_config.configuration.collection_id
            ).uuid
        )

    def get(self) -> DLiteExcelSessionUpdate:
        """Execute the strategy.

        This method will be called through the strategy-specific endpoint
        of the OTE-API Services.

        The workbook is parsed with the `parser/excel_xlsx` strategy, the
        resulting columns are turned into a record array with
        `dict2recarray()`, and a new DLite instance with one property per
        column is added to the collection.

        Returns:
            Session update with the collection id, the UUID of the new
            instance and the label it was stored under.

        Raises:
            ValueError: If `downloadUrl` or `mediaType` is not set, or if
                the parsed workbook contains no columns.
            TypeError: If the parsed units are not a list or tuple.

        """
        config = self.parse_config.configuration

        if config.downloadUrl is None:
            raise ValueError("downloadUrl is required.")
        if config.mediaType is None:
            raise ValueError("mediaType is required.")

        # Build the configuration for the strategy that reads the workbook.
        # A placeholder entity is used when none is configured.
        xlsx_config = {
            "parserType": "parser/excel_xlsx",
            "configuration": config.excel_config.model_dump(),
            "entity": (
                self.parse_config.entity
                if self.parse_config.entity
                else "https://example.org"
            ),
        }
        xlsx_config["configuration"].update(
            {
                "downloadUrl": config.downloadUrl,
                "mediaType": config.mediaType,
            }
        )
        parser = create_strategy("parse", xlsx_config)
        columns: dict[str, Any] = parser.get()["data"]

        if not columns:
            # `zip(*[])` yields nothing, so the unpacking below would fail
            # with an opaque "not enough values to unpack" ValueError.
            raise ValueError("No columns were parsed from the Excel file.")

        # Column headers are split into a property name and a unit.
        names, units = zip(*[split_column_name(column) for column in columns])
        rec = dict2recarray(columns, names=names)

        if not isinstance(units, (list, tuple)):
            # This check is to satisfy mypy for the `infer_metadata` call below.
            raise TypeError(
                f"units must be a list or tuple, instead it was {type(units)}"
            )

        meta_uri = self.parse_config.entity
        if meta_uri:
            # Make the configured "|"-separated storage paths searchable
            # before looking up the metadata by URI.
            if config.storage_path is not None:
                for storage_path in config.storage_path.split("|"):
                    dlite.storage_path.append(storage_path)
            meta = dlite.get_instance(str(meta_uri))
            # check the metadata config would go here
        else:
            meta = infer_metadata(rec, units=units)

        # Create the instance with one row per record and copy each column.
        inst = meta(dimensions=[len(rec)], id=config.id)
        for name in names:
            inst[name] = rec[name]

        # Insert inst into collection
        coll = get_collection(config.collection_id)
        coll.add(config.label, inst)

        update_collection(coll)
        return DLiteExcelSessionUpdate(
            collection_id=coll.uuid,
            inst_uuid=inst.uuid,
            label=config.label,
        )

parse_config: DLiteExcelParserConfig instance-attribute

get()

Execute the strategy.

This method will be called through the strategy-specific endpoint of the OTE-API Services.

Returns:

Type Description
DLiteExcelSessionUpdate

Session update with the collection id, the UUID of the new instance and its label.

Source code in oteapi_dlite/strategies/parse_excel.py
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def get(self) -> DLiteExcelSessionUpdate:
    """Execute the strategy.

    This method will be called through the strategy-specific endpoint
    of the OTE-API Services.

    The workbook is parsed with the `parser/excel_xlsx` strategy, the
    resulting columns are turned into a record array with
    `dict2recarray()`, and a new DLite instance with one property per
    column is added to the collection.

    Returns:
        Session update with the collection id, the UUID of the new
        instance and the label it was stored under.

    Raises:
        ValueError: If `downloadUrl` or `mediaType` is not set, or if
            the parsed workbook contains no columns.
        TypeError: If the parsed units are not a list or tuple.

    """
    config = self.parse_config.configuration

    if config.downloadUrl is None:
        raise ValueError("downloadUrl is required.")
    if config.mediaType is None:
        raise ValueError("mediaType is required.")

    # Build the configuration for the strategy that reads the workbook.
    # A placeholder entity is used when none is configured.
    xlsx_config = {
        "parserType": "parser/excel_xlsx",
        "configuration": config.excel_config.model_dump(),
        "entity": (
            self.parse_config.entity
            if self.parse_config.entity
            else "https://example.org"
        ),
    }
    xlsx_config["configuration"].update(
        {
            "downloadUrl": config.downloadUrl,
            "mediaType": config.mediaType,
        }
    )
    parser = create_strategy("parse", xlsx_config)
    columns: dict[str, Any] = parser.get()["data"]

    if not columns:
        # `zip(*[])` yields nothing, so the unpacking below would fail
        # with an opaque "not enough values to unpack" ValueError.
        raise ValueError("No columns were parsed from the Excel file.")

    # Column headers are split into a property name and a unit.
    names, units = zip(*[split_column_name(column) for column in columns])
    rec = dict2recarray(columns, names=names)

    if not isinstance(units, (list, tuple)):
        # This check is to satisfy mypy for the `infer_metadata` call below.
        raise TypeError(
            f"units must be a list or tuple, instead it was {type(units)}"
        )

    meta_uri = self.parse_config.entity
    if meta_uri:
        # Make the configured "|"-separated storage paths searchable
        # before looking up the metadata by URI.
        if config.storage_path is not None:
            for storage_path in config.storage_path.split("|"):
                dlite.storage_path.append(storage_path)
        meta = dlite.get_instance(str(meta_uri))
        # check the metadata config would go here
    else:
        meta = infer_metadata(rec, units=units)

    # Create the instance with one row per record and copy each column.
    inst = meta(dimensions=[len(rec)], id=config.id)
    for name in names:
        inst[name] = rec[name]

    # Insert inst into collection
    coll = get_collection(config.collection_id)
    coll.add(config.label, inst)

    update_collection(coll)
    return DLiteExcelSessionUpdate(
        collection_id=coll.uuid,
        inst_uuid=inst.uuid,
        label=config.label,
    )

initialize()

Initialize.

Source code in oteapi_dlite/strategies/parse_excel.py
127
128
129
130
131
132
133
def initialize(self) -> DLiteResult:
    """Initialize.

    Returns:
        Result carrying the UUID of the collection obtained from
        `get_collection()` for the configured `collection_id`.

    """
    configured_id = self.parse_config.configuration.collection_id
    collection = get_collection(configured_id)
    return DLiteResult(collection_id=collection.uuid)

infer_metadata(rec, units)

Infer dlite metadata from recarray rec.

Source code in oteapi_dlite/strategies/parse_excel.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
def infer_metadata(rec: np.recarray, units: tuple[str, ...]) -> dlite.Instance:
    """Infer dlite metadata from recarray `rec`.

    A data model with a randomly generated URI is built. It has a single
    dimension, `nrows`, and one property per field of `rec`, each shaped
    `["nrows"]`.

    Args:
        rec: Record array whose fields become the properties.
        units: Unit of each field, indexed in the order of
            `rec.dtype.names`.

    Returns:
        The result of `DataModel.get()` for the generated data model.

    """
    # 128 random bits, rendered as hex, keep generated URIs distinct.
    suffix = format(getrandbits(128), "0x")
    uri = f"http://onto-ns.com/meta/1.0/generated_from_excel_{suffix}"

    metadata = DataModel(
        uri, description="Generated datamodel from excel file."
    )
    metadata.add_dimension("nrows", "Number of rows.")

    for index, name in enumerate(rec.dtype.names):
        field_dtype = rec[name].dtype
        # Unicode fields are declared as "string"; every other field uses
        # the dtype name as its property type.
        if field_dtype.kind == "U":
            ptype = "string"
        else:
            ptype = field_dtype.name
        metadata.add_property(
            name, type=ptype, shape=["nrows"], unit=units[index]
        )

    return metadata.get()

split_column_name(column)

Split column name into a (name, unit) tuple.

Source code in oteapi_dlite/strategies/parse_excel.py
205
206
207
208
209
210
211
def split_column_name(column: str) -> tuple[str, str]:
    """Split column name into a (name, unit) tuple.

    The name is the first run of characters that contains no space and
    none of the opening brackets `(`, `[` or `<`. The unit is the run of
    characters that follows, with optional whitespace and one optional
    opening bracket skipped before it; it ends at a space or a closing
    bracket.

    Args:
        column: Column header to split.

    Returns:
        The `(name, unit)` pair, or `(column, "")` if the header does not
        start (after leading whitespace) with a valid name.

    """
    pattern = r"\s*([^ ([<]+)\s*[([<]?([^] )>]*)[])>]?"
    found = re.match(pattern, column)
    if found is None:
        return column, ""
    return found.group(1), found.group(2)