Source code for Hive_ML.data_loader.feature_loader

from os import PathLike
from typing import Union, Tuple, List

import numpy
import numpy as np
import pandas as pd

from Hive_ML.utilities.feature_utils import get_feature_set_details, get_4D_feature_stats, flatten_4D_features


def load_feature_set(
        feature_set_filename: Union[str, PathLike],
        get_4D_stats: bool = True,
        flatten_features: bool = False,
        select_T: Union[int, None] = None,
) -> Tuple[
    numpy.ndarray,
    List[str],
    List[str],
    List[str],
    Union[numpy.ndarray, None],
    Union[numpy.ndarray, None],
    Union[numpy.ndarray, None],
    Union[numpy.ndarray, None],
]:
    """
    Function to load a feature set from a filepath, including the Subject list, their corresponding labels and a list
    of feature names.

    The file type is chosen from the filename extension (``.xlsx``, ``.csv`` or ``.pkl``, case-insensitive). The loaded
    table is sorted by ``Subject_Label``, ``Subject_ID`` and ``Sequence_Number`` before the features are extracted.

    If ``get_4D_stats`` is set to **True**, the 4D statistics of the feature set are returned
    (see :func:`utilities.feature_utils.get_4D_feature_stats` ), otherwise the four statistics entries are ``None``.

    If ``flatten_features`` is set to **True**, the feature set is flattened along the sequence dimension
    (see :func:`utilities.feature_utils.flatten_4D_features`).

    If ``select_T`` is set to an integer value, the specific sequence is extracted and returned from the feature set.

    Parameters
    ----------
    feature_set_filename :
        Feature set file path (string or path-like object).
    get_4D_stats :
        Flag to compute and return sequence statistics.
    flatten_features :
        Flag to flatten features along the sequence dimension.
    select_T :
        Select and return only the specified sequence.

    Returns
    -------
        Feature set Array, Subject list, Subject labels, List of feature names, followed by the four arrays returned
        by ``get_4D_feature_stats``, named here (in order) mean, sum, standard deviation and mean delta.
        NOTE(review): these names follow the unpacking order used in this function; confirm them against the return
        order of ``get_4D_feature_stats``.

    Raises
    ------
    ValueError
        If the filename extension is not one of ``.xlsx``, ``.csv``, ``.pkl``.
    """
    # Function-scope import: the module only imports ``PathLike`` from ``os``.
    from os import fsdecode

    # Path-like objects (e.g. ``pathlib.Path``) have no ``endswith``; normalise to ``str`` for the extension check
    # only, and hand the caller's original object to pandas.
    filename = fsdecode(feature_set_filename).lower()

    if filename.endswith(".xlsx"):
        feature_set = pd.read_excel(feature_set_filename, index_col=0)
    elif filename.endswith(".csv"):
        feature_set = pd.read_csv(feature_set_filename, index_col=0)
    elif filename.endswith(".pkl"):
        # ``read_pickle`` has no ``index_col`` argument: a pickled DataFrame already carries its own index.
        # NOTE: unpickling can execute arbitrary code - only load pickle files from trusted sources.
        feature_set = pd.read_pickle(feature_set_filename)
    else:
        raise ValueError("Input file format not recognized, expected one of: '.xlsx', '.csv', '.pkl'")

    feature_set = feature_set.sort_values(by=["Subject_Label", "Subject_ID", "Sequence_Number"])

    feature_list, subject_ids, subject_labels, feature_names = get_feature_set_details(feature_set)

    if get_4D_stats:
        mean_features, sum_features, std_features, mean_delta_features = get_4D_feature_stats(feature_list)
    else:
        mean_features, sum_features, std_features, mean_delta_features = None, None, None, None

    if flatten_features:
        feature_list, feature_names = flatten_4D_features(feature_list, feature_names)

    if select_T is not None:
        # Drop the second-to-last axis (must have length 1), then keep only the requested sequence index.
        feature_list = np.array(feature_list).squeeze(axis=-2)[int(select_T), :, :]

    return (
        np.array(feature_list),
        subject_ids,
        subject_labels,
        feature_names,
        mean_features,
        sum_features,
        std_features,
        mean_delta_features,
    )