Skip to content

parse_excel

Strategy for parsing an Excel spreadsheet to a DLite instance.

DLiteExcelParseConfig

Bases: AttrDict

Configuration for DLite Excel parser.

Source code in oteapi_dlite/strategies/parse_excel.py
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
class DLiteExcelParseConfig(AttrDict):
    """Configuration for DLite Excel parser."""

    # Optional URI of existing DLite metadata.  When set, the strategy's
    # `get()` loads it with `dlite.get_instance()`; when left as None the
    # metadata is inferred from the parsed columns via `infer_metadata()`.
    metadata: Annotated[
        Optional[HttpUrl],
        Field(
            description=(
                "URI of DLite metadata to return.  If not provided, the "
                "metadata will be inferred from the excel file."
            ),
        ),
    ] = None

    # Optional id forwarded as `id=` when the new instance is created from
    # the metadata in `get()`.
    id: Annotated[
        Optional[str], Field(description="Optional id on new instance.")
    ] = None

    # Label under which `get()` adds the new instance to the collection.
    # Defaults to "excel-data".
    label: Annotated[
        Optional[str],
        Field(
            description="Optional label for new instance in collection.",
        ),
    ] = "excel-data"

    # Required (no default).  `get()` passes this on as the "configuration"
    # of the underlying `XLSXParseStrategy`.
    excel_config: Annotated[
        XLSXParseConfig,
        Field(
            description="DLite-specific excel configurations.",
        ),
    ]
    # Optional "|"-separated list of paths.  `get()` appends each entry to
    # `dlite.storage_path`, but only when `metadata` is given.
    storage_path: Annotated[
        Optional[str],
        Field(
            description="Path to metadata storage",
        ),
    ] = None

excel_config: Annotated[XLSXParseConfig, Field(description='DLite-specific excel configurations.')] instance-attribute

id: Annotated[Optional[str], Field(description='Optional id on new instance.')] = None class-attribute instance-attribute

label: Annotated[Optional[str], Field(description='Optional label for new instance in collection.')] = 'excel-data' class-attribute instance-attribute

metadata: Annotated[Optional[HttpUrl], Field(description='URI of DLite metadata to return. If not provided, the metadata will be inferred from the excel file.')] = None class-attribute instance-attribute

storage_path: Annotated[Optional[str], Field(description='Path to metadata storage')] = None class-attribute instance-attribute

DLiteExcelParseResourceConfig

Bases: ResourceConfig

DLite excel parse strategy resource config.

Source code in oteapi_dlite/strategies/parse_excel.py
67
68
69
70
71
72
73
class DLiteExcelParseResourceConfig(ResourceConfig):
    """DLite excel parse strategy resource config."""

    # Required strategy-specific settings, typed as `DLiteExcelParseConfig`.
    # `DLiteExcelStrategy.get()` reads it via `parse_config.configuration`.
    configuration: Annotated[
        DLiteExcelParseConfig,
        Field(description="DLite excel parse strategy-specific configuration."),
    ]

configuration: Annotated[DLiteExcelParseConfig, Field(description='DLite excel parse strategy-specific configuration.')] instance-attribute

DLiteExcelSessionUpdate

Bases: DLiteSessionUpdate

Class for returning values from DLite excel parser.

Source code in oteapi_dlite/strategies/parse_excel.py
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
class DLiteExcelSessionUpdate(DLiteSessionUpdate):
    """Class for returning values from DLite excel parser."""

    # Filled by `DLiteExcelStrategy.get()` with the `uuid` of the instance
    # it created from the spreadsheet data.
    inst_uuid: Annotated[
        str,
        Field(
            description="UUID of new instance.",
        ),
    ]
    # Filled by `DLiteExcelStrategy.get()` with the configured label used
    # when the instance was added to the collection.
    label: Annotated[
        str,
        Field(
            description="Label of the new instance in the collection.",
        ),
    ]

inst_uuid: Annotated[str, Field(description='UUID of new instance.')] instance-attribute

label: Annotated[str, Field(description='Label of the new instance in the collection.')] instance-attribute

DLiteExcelStrategy

Parse strategy for Excel files.

Registers strategies:

  • ("mediaType", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
Source code in oteapi_dlite/strategies/parse_excel.py
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
@dataclass
class DLiteExcelStrategy:
    """Parse strategy for Excel files.

    **Registers strategies**:

    - `("mediaType",
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")`

    """

    parse_config: DLiteExcelParseResourceConfig

    def initialize(
        self,
        session: Optional[dict[str, Any]] = None,
    ) -> DLiteSessionUpdate:
        """Initialize the strategy.

        Parameters:
            session: A session-specific dictionary context.

        Returns:
            Session update holding the UUID of the collection that
            `get_collection()` returns for `session`.

        """
        return DLiteSessionUpdate(collection_id=get_collection(session).uuid)

    def get(
        self, session: Optional[dict[str, Any]] = None
    ) -> DLiteExcelSessionUpdate:
        """Execute the strategy.

        This method will be called through the strategy-specific endpoint
        of the OTE-API Services.

        The spreadsheet is parsed with `XLSXParseStrategy`, the resulting
        columns are stored in a new DLite instance and that instance is
        added to the session collection under the configured label.

        Parameters:
            session: A session-specific dictionary context.

        Returns:
            Session update with the collection id, the UUID of the new
            instance and the label it was given in the collection.

        Raises:
            ValueError: If no columns were parsed from the excel file.
            TypeError: If the parsed units are neither a list nor a tuple.

        """
        config = self.parse_config.configuration

        # Delegate the spreadsheet parsing to the generic XLSX strategy:
        # reuse this resource config, but swap in the excel-specific
        # configuration and the XLSX media type.
        xlsx_config = self.parse_config.model_dump()
        xlsx_config["configuration"] = config.excel_config
        xlsx_config["mediaType"] = (
            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
        )
        parser: IParseStrategy = XLSXParseStrategy(xlsx_config)
        columns: dict[str, Any] = parser.get(session)["data"]

        if not columns:
            # Without this guard `zip(*[])` below fails with an opaque
            # "not enough values to unpack" ValueError.
            raise ValueError("No columns were parsed from the excel file.")

        # Column headers are split into a property name and a unit.
        names, units = zip(*[split_column_name(column) for column in columns])
        rec = dict2recarray(columns, names=names)

        if not isinstance(units, (list, tuple)):
            # This check is to satisfy mypy for the `infer_metadata` call below.
            raise TypeError(
                f"units must be a list or tuple, instead it was {type(units)}"
            )

        if config.metadata:
            if config.storage_path is not None:
                for storage_path in config.storage_path.split("|"):
                    dlite.storage_path.append(storage_path)
            # `config.metadata` is a pydantic URL object, not necessarily a
            # `str`; hand DLite its plain string form.
            meta = dlite.get_instance(str(config.metadata))
            # check the metadata config would go here
        else:
            meta = infer_metadata(rec, units=units)

        # One dimension (the number of rows) sizes every property.
        inst = meta(dimensions=[len(rec)], id=config.id)
        for name in names:
            inst[name] = rec[name]

        # Insert inst into collection
        coll = get_collection(session)
        coll.add(config.label, inst)

        update_collection(coll)
        return DLiteExcelSessionUpdate(
            collection_id=coll.uuid,
            inst_uuid=inst.uuid,
            label=config.label,
        )

parse_config: DLiteExcelParseResourceConfig instance-attribute

get(session=None)

Execute the strategy.

This method will be called through the strategy-specific endpoint of the OTE-API Services.

Parameters:

Name Type Description Default
session Optional[dict[str, Any]]

A session-specific dictionary context.

None

Returns:

Type Description
DLiteExcelSessionUpdate

Session update with the collection id, the UUID of the new instance and its label in the collection.

Source code in oteapi_dlite/strategies/parse_excel.py
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
def get(
    self, session: Optional[dict[str, Any]] = None
) -> DLiteExcelSessionUpdate:
    """Execute the strategy.

    This method will be called through the strategy-specific endpoint
    of the OTE-API Services.

    The spreadsheet is parsed with `XLSXParseStrategy`, the resulting
    columns are stored in a new DLite instance and that instance is
    added to the session collection under the configured label.

    Parameters:
        session: A session-specific dictionary context.

    Returns:
        Session update with the collection id, the UUID of the new
        instance and the label it was given in the collection.

    Raises:
        ValueError: If no columns were parsed from the excel file.
        TypeError: If the parsed units are neither a list nor a tuple.

    """
    config = self.parse_config.configuration

    # Delegate the spreadsheet parsing to the generic XLSX strategy:
    # reuse this resource config, but swap in the excel-specific
    # configuration and the XLSX media type.
    xlsx_config = self.parse_config.model_dump()
    xlsx_config["configuration"] = config.excel_config
    xlsx_config["mediaType"] = (
        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
    )
    parser: IParseStrategy = XLSXParseStrategy(xlsx_config)
    columns: dict[str, Any] = parser.get(session)["data"]

    if not columns:
        # Without this guard `zip(*[])` below fails with an opaque
        # "not enough values to unpack" ValueError.
        raise ValueError("No columns were parsed from the excel file.")

    # Column headers are split into a property name and a unit.
    names, units = zip(*[split_column_name(column) for column in columns])
    rec = dict2recarray(columns, names=names)

    if not isinstance(units, (list, tuple)):
        # This check is to satisfy mypy for the `infer_metadata` call below.
        raise TypeError(
            f"units must be a list or tuple, instead it was {type(units)}"
        )

    if config.metadata:
        if config.storage_path is not None:
            for storage_path in config.storage_path.split("|"):
                dlite.storage_path.append(storage_path)
        # `config.metadata` is a pydantic URL object, not necessarily a
        # `str`; hand DLite its plain string form.
        meta = dlite.get_instance(str(config.metadata))
        # check the metadata config would go here
    else:
        meta = infer_metadata(rec, units=units)

    # One dimension (the number of rows) sizes every property.
    inst = meta(dimensions=[len(rec)], id=config.id)
    for name in names:
        inst[name] = rec[name]

    # Insert inst into collection
    coll = get_collection(session)
    coll.add(config.label, inst)

    update_collection(coll)
    return DLiteExcelSessionUpdate(
        collection_id=coll.uuid,
        inst_uuid=inst.uuid,
        label=config.label,
    )

initialize(session=None)

Initialize.

Source code in oteapi_dlite/strategies/parse_excel.py
106
107
108
109
110
111
def initialize(
    self,
    session: Optional[dict[str, Any]] = None,
) -> DLiteSessionUpdate:
    """Resolve the collection for this session and report its id.

    Parameters:
        session: A session-specific dictionary context.

    Returns:
        Session update carrying the UUID of the collection that
        `get_collection()` returns for `session`.

    """
    collection = get_collection(session)
    return DLiteSessionUpdate(collection_id=collection.uuid)

infer_metadata(rec, units)

Infer dlite metadata from recarray rec.

Source code in oteapi_dlite/strategies/parse_excel.py
181
182
183
184
185
186
187
188
189
190
191
192
193
194
def infer_metadata(rec: np.recarray, units: tuple[str, ...]) -> dlite.Instance:
    """Build DLite metadata describing the columns of recarray `rec`.

    A data model with a randomly generated URI is created.  It has a single
    dimension, ``nrows``, and one property per field of `rec`, each shaped
    by ``nrows``.

    Parameters:
        rec: Record array whose fields become the properties.
        units: Unit for each field, in the same order as `rec.dtype.names`.

    Returns:
        The result of calling `get()` on the populated data model.

    """
    # 128 random bits, rendered as hex, keep the generated URI unique.
    rnd = getrandbits(128)
    datamodel = DataModel(
        f"http://onto-ns.com/meta/1.0/generated_from_excel_{rnd:0x}",
        description="Generated datamodel from excel file.",
    )
    datamodel.add_dimension("nrows", "Number of rows.")

    for index, field in enumerate(rec.dtype.names):
        field_dtype = rec[field].dtype
        # Unicode columns are declared as "string"; every other column uses
        # the numpy dtype name as-is.
        if field_dtype.kind == "U":
            property_type = "string"
        else:
            property_type = field_dtype.name
        datamodel.add_property(
            field,
            type=property_type,
            shape=["nrows"],
            unit=units[index],
        )

    return datamodel.get()

split_column_name(column)

Split column name into a (name, unit) tuple.

Source code in oteapi_dlite/strategies/parse_excel.py
172
173
174
175
176
177
178
def split_column_name(column: str) -> tuple[str, str]:
    """Separate a column header into its name and unit parts.

    The name is the leading run of characters up to the first space or
    opening bracket (one of ``(``, ``[``, ``<``).  The unit is the run that
    follows, optionally wrapped in brackets; it is an empty string when
    nothing follows the name.

    Parameters:
        column: The column header to split.

    Returns:
        A ``(name, unit)`` tuple.  If the header does not match the
        expected pattern, ``(column, "")`` is returned.

    """
    found = re.match(r"\s*([^ ([<]+)\s*[([<]?([^] )>]*)[])>]?", column)
    if found is None:
        return column, ""
    return found.group(1), found.group(2)