# Source code for probnet.models.base_net

#!/usr/bin/env python
# Created by "Thieu" at 11:03, 02/05/2025 ----------%                                                                               
#       Email: nguyenthieu2102@gmail.com            %                                                    
#       Github: https://github.com/thieu1995        %                         
# --------------------------------------------------%

import inspect
import pickle
import pprint
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.base import BaseEstimator
from permetrics import RegressionMetric, ClassificationMetric
from probnet.helpers import validator
from probnet.helpers import kernel as kernel_module
from probnet.helpers import distance as distance_module
from probnet.helpers.metrics import get_all_regression_metrics, get_all_classification_metrics


class BaseNet(BaseEstimator):
    """
    Base class for neural network models. Inherits from `BaseEstimator` to integrate with scikit-learn pipelines.

    The public modelling entry points defined here (`fit`, `predict`, `predict_proba`,
    `score`, `scores`, `evaluate`) are placeholders that do nothing and return None;
    concrete models are expected to override them. The double-underscore helpers
    (`__evaluate_reg`, `__score_cls`, ...) are name-mangled by Python, so code outside
    this class reaches them as `_BaseNet__evaluate_reg`, `_BaseNet__score_cls`, etc.

    Attributes
    ----------
    SUPPORTED_CLS_METRICS : dict
        Dictionary of supported classification metrics.
    SUPPORTED_REG_METRICS : dict
        Dictionary of supported regression metrics.
    CLS_OBJ_LOSSES : dict
        Dictionary of classification objective losses.
    SUPPORTED_KERNELS : list
        List of supported kernel functions.
    SUPPORTED_METRICS : list
        List of supported distance metrics.

    Parameters
    ----------
    sigma : float, default=1.0
        The bandwidth parameter for the kernel function.
    kernel : str, default='gaussian'
        The kernel function to use.
    dist : str, default='euclidean'
        The distance metric to use.
    kwargs : dict
        Additional keyword arguments for customization.
    """

    SUPPORTED_CLS_METRICS = get_all_classification_metrics()
    SUPPORTED_REG_METRICS = get_all_regression_metrics()
    CLS_OBJ_LOSSES = {}

    SUPPORTED_KERNELS = ["gaussian", "laplace", "cauchy", "epanechnikov", "uniform",
                         "triangular", "quartic", "cosine", "logistic", "sigmoid",
                         "multiquadric", "inverse_multiquadric", "rational_quadratic",
                         "exponential", "power", "linear", "bessel", "vonmises", "vonmises_fisher"]
    SUPPORTED_METRICS = ['euclidean', 'manhattan', "chebyshev", "minkowski", "hamming", "canberra",
                         "braycurtis", "jaccard", "sokalmichener", "sokalsneath", "russellrao",
                         "yule", "kulsinski", "rogers_tanimoto", "kulczynski", "morisita", "morisita_horn",
                         "dice", "kappa", "rogers", "jensen", "jensen_shannon", "hellinger",
                         "bhattacharyya", "cityblock", "cosin", "correlation", "mahalanobis"]

    def __init__(self, sigma=1.0, kernel='gaussian', dist='euclidean', **kwargs):
        """
        Initialize the BaseNet class.

        Besides storing the arguments, this resolves the kernel and distance names
        into callables (`kernel_func`, `dist_func`) through `set_kernel` / `set_dist`
        and initialises `n_labels` to None.

        Parameters
        ----------
        sigma : float, default=1.0
            The bandwidth parameter for the kernel function.
        kernel : str, default='gaussian'
            The kernel function to use.
        dist : str, default='euclidean'
            The distance metric to use.
        kwargs : dict
            Additional keyword arguments for customization. Stored as-is on `self.kwargs`.
        """
        super().__init__()
        self.sigma = sigma
        self.set_kernel(kernel)
        self.set_dist(dist)
        self.kwargs = kwargs
        # Read by the classification scoring helpers to decide between
        # `predict` and `predict_proba`; nothing in this class assigns a value.
        self.n_labels = None

    def set_kernel(self, kernel):
        """
        Set the kernel function.

        Stores the validated name on `self.kernel` and the callable named
        ``"<kernel>_kernel"`` from the kernel helper module on `self.kernel_func`.

        Parameters
        ----------
        kernel : str
            The kernel function to use ('gaussian', 'laplace', 'cauchy', 'epanechnikov',...).
            Checked against `SUPPORTED_KERNELS`.
        """
        self.kernel = validator.check_str("kernel", kernel, self.SUPPORTED_KERNELS)
        self.kernel_func = getattr(kernel_module, f"{self.kernel}_kernel")

    def set_dist(self, dist):
        """
        Set the distance metric.

        Stores the validated name on `self.dist` and the callable named
        ``"<dist>_distance"`` from the distance helper module on `self.dist_func`.

        Parameters
        ----------
        dist : str
            The distance metric to use ('euclidean', 'manhattan',...).
            Checked against `SUPPORTED_METRICS`.
        """
        self.dist = validator.check_str("dist", dist, self.SUPPORTED_METRICS)
        self.dist_func = getattr(distance_module, f"{self.dist}_distance")

    def __repr__(self, **kwargs):
        """Pretty-print parameters like scikit-learn's Estimator.

        Parameter names come from the signature of ``__init__``; each value is read
        from the instance attribute of the same name. The result is a single line
        when the parameter string is at most 80 characters long, otherwise one
        parameter per line. Extra keyword arguments are accepted and ignored.
        """
        param_order = list(inspect.signature(self.__init__).parameters.keys())
        param_dict = {k: getattr(self, k) for k in param_order}

        param_str = ", ".join(f"{k}={repr(v)}" for k, v in param_dict.items())
        if len(param_str) <= 80:
            return f"{self.__class__.__name__}({param_str})"

        formatted_params = ",\n  ".join(f"{k}={pprint.pformat(v)}" for k, v in param_dict.items())
        return f"{self.__class__.__name__}(\n  {formatted_params}\n)"

    def fit(self, X, y):
        """
        Fit the model to the training data.

        Placeholder: this base implementation does nothing and returns None.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Training data.
        y : array-like of shape (n_samples,)
            Target values.
        """
        pass

    def predict(self, X):
        """
        Predict target values for the given input data.

        Placeholder: this base implementation does nothing and returns None.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        array-like
            Predicted target values.
        """
        pass

    def predict_proba(self, X):
        """
        Predict class probabilities for the given input data.

        Placeholder: this base implementation does nothing and returns None.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        Returns
        -------
        array-like
            Predicted class probabilities.
        """
        pass

    def __evaluate_reg(self, y_true, y_pred, list_metrics=("MSE", "MAE")):
        """
        Compute several regression metrics for already-available predictions.

        Parameters
        ----------
        y_true : array-like
            Ground truth (correct) target values.
        y_pred : array-like
            Estimated target values.
        list_metrics : tuple/list of str, optional
            List of metric names to evaluate. Default is ("MSE", "MAE").

        Returns
        -------
        The result of `RegressionMetric.get_metrics_by_list_names(list_metrics)`.
        """
        rm = RegressionMetric(y_true=y_true, y_pred=y_pred)
        return rm.get_metrics_by_list_names(list_metrics)

    def __evaluate_cls(self, y_true, y_pred, list_metrics=("AS", "RS")):
        """
        Compute several classification metrics for already-available predictions.

        Parameters
        ----------
        y_true : array-like of shape (n_samples,)
            True class labels.
        y_pred : array-like of shape (n_samples,)
            Predicted class labels by the classifier.
        list_metrics : tuple/list of str, optional
            List of metric names to evaluate, by default ("AS", "RS").

        Returns
        -------
        The result of `ClassificationMetric.get_metrics_by_list_names(list_metrics)`.
        """
        cm = ClassificationMetric(y_true, y_pred)
        return cm.get_metrics_by_list_names(list_metrics)

    def __score_reg(self, X, y, metric="RMSE"):
        """
        Predict on `X` and return a single regression metric.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples used for prediction.

        y : array-like of shape (n_samples,)
            The true target values.

        metric : str, optional, default="RMSE"
            The regression metric to be used for scoring. The name is passed to
            `RegressionMetric.get_metric_by_name` without prior validation here.

        Returns
        -------
        float
            The calculated regression metric based on the metric provided.
        """
        y_pred = self.predict(X)
        return RegressionMetric(y, y_pred).get_metric_by_name(metric)[metric]

    def __scores_reg(self, X, y, list_metrics=("MSE", "MAE")):
        """
        Predict on `X` and return several regression metrics.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data.

        y : array-like of shape (n_samples,)
            True values for X.

        list_metrics : tuple of str, optional
            List of evaluation metrics to be used. Default is ("MSE", "MAE").

        Returns
        -------
        The result of `__evaluate_reg` for the predictions on `X`.
        """
        y_pred = self.predict(X)
        return self.__evaluate_reg(y_true=y, y_pred=y_pred, list_metrics=list_metrics)

    def __score_cls(self, X, y, metric="AS"):
        """
        Predict on `X` and return a single classification metric.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Test samples to score.

        y : array-like of shape (n_samples,)
            True labels for X.

        metric : str, default="AS"
            Scoring metric to use. Supported names are the keys of self.SUPPORTED_CLS_METRICS.

        Returns
        -------
        float
            Computed score based on the specified metric.
        """
        metric = validator.check_str("metric", metric, list(self.SUPPORTED_CLS_METRICS.keys()))
        # Probabilities are used only for an objective loss on a problem with more
        # than two labels. The membership test comes first so `n_labels` is only
        # consulted when it can change the outcome.
        if metric in self.CLS_OBJ_LOSSES and self.n_labels > 2:
            y_pred = self.predict_proba(X)
        else:
            y_pred = self.predict(X)
        cm = ClassificationMetric(y_true=y, y_pred=y_pred)
        return cm.get_metric_by_name(metric)[metric]

    def __scores_cls(self, X, y, list_metrics=("AS", "RS")):
        """
        Predict on `X` and return several classification metrics.

        Names found in `CLS_OBJ_LOSSES` are treated as losses; they are evaluated on
        `predict_proba(X)` when `n_labels > 2` and on `predict(X)` otherwise. Names
        found in `SUPPORTED_CLS_METRICS` but not in `CLS_OBJ_LOSSES` are evaluated on
        `predict(X)`. Any other name is ignored.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix for the samples for which predictions are to be made.

        y : array-like of shape (n_samples,)
            True labels for the samples.

        list_metrics : tuple of str, optional
            List of metric names to evaluate. Possible values include 'AS', 'RS', etc. Default is ('AS', 'RS').

        Returns
        -------
        dict
            A dictionary with the performance metrics from the selected names listed in `list_metrics`.
        """
        # De-duplicate while keeping the caller's order, so the names handed to the
        # metric backend do not depend on set iteration order.
        requested = list(dict.fromkeys(list_metrics))
        list_errors = [name for name in requested if name in self.CLS_OBJ_LOSSES]
        list_scores = [name for name in requested
                       if name in self.SUPPORTED_CLS_METRICS and name not in self.CLS_OBJ_LOSSES]

        # Label predictions are needed for the score metrics and, on problems with
        # at most two labels, for the losses too: compute them once.
        y_label = self.predict(X)
        t2 = self.__evaluate_cls(y_true=y, y_pred=y_label, list_metrics=list_scores)

        t1 = {}
        if list_errors:
            y_loss = self.predict_proba(X) if self.n_labels > 2 else y_label
            t1 = self.__evaluate_cls(y_true=y, y_pred=y_loss, list_metrics=list_errors)
        return {**t2, **t1}

    def score(self, X, y):
        """Default interface for score function. Placeholder: does nothing and returns None."""
        pass

    def scores(self, X, y, list_metrics=None):
        """Return the list of metrics of the prediction. Placeholder: does nothing and returns None."""
        pass

    def evaluate(self, y_true, y_pred, list_metrics=None):
        """Return the list of performance metrics of the prediction. Placeholder: does nothing and returns None."""
        pass

    def save_metrics(self, y_true, y_pred, list_metrics=("RMSE", "MAE"), save_path="history", filename="metrics.csv"):
        """
        Save evaluation metrics to csv file

        The metrics are obtained from `self.evaluate(y_true, y_pred, list_metrics)` and
        written as a single-row table. `save_path` is created if it does not exist.

        Parameters
        ----------
        y_true : ground truth data
        y_pred : predicted output
        list_metrics : list of evaluation metrics
        save_path : saved path (relative path, consider from current executed script path)
        filename : name of the file, needs to have ".csv" extension
        """
        folder = Path(save_path)
        folder.mkdir(parents=True, exist_ok=True)
        results = self.evaluate(y_true, y_pred, list_metrics)
        # orient='index' followed by a transpose yields one column per metric.
        df = pd.DataFrame.from_dict(results, orient='index').T
        df.to_csv(folder / filename, index=False)

    def save_y_predicted(self, X, y_true, save_path="history", filename="y_predicted.csv"):
        """
        Save the predicted results to csv file

        Writes two columns, "y_true" and "y_pred", where "y_pred" is `self.predict(X)`.
        Both are squeezed with numpy before being written. `save_path` is created if
        it does not exist.

        Parameters
        ----------
        X : The features data, nd.ndarray
        y_true : The ground truth data
        save_path : saved path (relative path, consider from current executed script path)
        filename : name of the file, needs to have ".csv" extension
        """
        folder = Path(save_path)
        folder.mkdir(parents=True, exist_ok=True)
        y_pred = self.predict(X)
        data = {"y_true": np.squeeze(np.asarray(y_true)), "y_pred": np.squeeze(np.asarray(y_pred))}
        pd.DataFrame(data).to_csv(folder / filename, index=False)

    def save_model(self, save_path="history", filename="model.pkl"):
        """
        Save model to pickle file

        `save_path` is created if it does not exist. If `filename` does not end with
        ".pkl", the extension is appended.

        Parameters
        ----------
        save_path : saved path (relative path, consider from current executed script path)
        filename : name of the file, needs to have ".pkl" extension
        """
        folder = Path(save_path)
        folder.mkdir(parents=True, exist_ok=True)
        if not filename.endswith(".pkl"):
            filename += ".pkl"
        # The context manager closes the file even when pickling fails.
        with open(folder / filename, 'wb') as file:
            pickle.dump(self, file)

    @staticmethod
    def load_model(load_path="history", filename="model.pkl"):
        """
        Load a model from a pickle file.

        Parameters
        ----------
        load_path : str, optional
            Directory path where the model file is located. Defaults to "history".
        filename : str
            Name of the file to be loaded. If the filename doesn't end with ".pkl", the extension is automatically added.

        Returns
        -------
        object
            The model loaded from the specified pickle file.

        Notes
        -----
        Unpickling can execute arbitrary code. Only load files from a trusted source.
        """
        if not filename.endswith(".pkl"):
            filename += ".pkl"
        # SECURITY: pickle.load must not be used on untrusted files.
        with open(Path(load_path) / filename, 'rb') as file:
            return pickle.load(file)