# PyFacts/pyfacts/utils.py

from __future__ import annotations

import datetime
import statistics
from dataclasses import dataclass
from typing import List, Literal, Mapping, Sequence, Tuple

from dateutil.relativedelta import relativedelta

from .exceptions import DateNotFoundError, DateOutOfRangeError


@dataclass
class PyfactsOptions:
    """Container for package-level default options.

    The defaults are read as class attributes; within this module, ``_parse_date``
    falls back to ``PyfactsOptions.date_format`` when it is called without a format.
    """

    # Default format used to parse date strings (datetime.strptime syntax).
    date_format: str = "%Y-%m-%d"
    # Default matching direction; "next" is the alternative noted by the original author.
    closest: str = "previous"  # next
    # NOTE(review): not referenced in this module; presumably the number of days
    # treated as one year by calculations elsewhere - confirm against callers.
    traded_days: int = 365
    # NOTE(review): not referenced in this module; presumably the default match
    # mode for date look-ups - confirm against callers.
    get_closest: str = "exact"


def _parse_date(date: str | datetime.date, date_format: str | None = None) -> datetime.datetime:
    """Parses a date string, or normalises a date-like object, into a datetime

    Parameters:
    -----------
    date: str | datetime.date
        The date to be parsed.
        If a datetime.date or datetime.datetime object is passed, it is converted to a
        datetime.datetime at midnight of the same day (any time component is dropped).

    date_format: str, default None
        The format of the date string in datetime.strptime friendly format.
        If format is None, the format in PyfactsOptions.date_format will be used.

    Returns:
    --------
        Returns a datetime.datetime object.

    Raises:
    -------
        ValueError: If the date is neither a string nor a date-like object,
            or if the string could not be parsed with the given format.
            The underlying exception is available as __cause__.
    """

    # datetime.datetime is a subclass of datetime.date, so one check covers both.
    if isinstance(date, datetime.date):
        # Going through the ordinal keeps only the calendar day.
        return datetime.datetime.fromordinal(date.toordinal())

    if date_format is None:
        date_format = PyfactsOptions.date_format

    try:
        return datetime.datetime.strptime(date, date_format)
    except TypeError as err:
        # Kept as ValueError so that callers only have one exception type to handle
        raise ValueError("Date does not seem to be valid date-like string") from err
    except ValueError as err:
        raise ValueError(
            "Date could not be parsed. Have you set the correct date format in PyfactsOptions.date_format?"
        ) from err


def _preprocess_timeseries(
    data: Sequence[Tuple[str | datetime.datetime, float]]
    | Sequence[Mapping[str | datetime.datetime, float]]
    | Mapping[str | datetime.datetime, float],
    date_format: str,
) -> List[Tuple[datetime.datetime, float]]:
    """Converts the supported input shapes to the TimeSeries friendly format.

        The TimeSeries class can internally process a list of Tuples.
        However, users have the option of passing a variety of types.
        This function preprocesses the data and converts it into the relevant format.

        If the data is a dictionary, it will be converted using .items() iteration.
        If the data is not a dictionary or a list-like sequence, it will raise an error.
        Strings and bytes are rejected even though they are sequences.
        If the data is empty, an empty list is returned.
        Otherwise the shape is decided by looking at the first item only:
            * If it is a list or tuple, every item is unpacked as a (date, value) pair
            * If it is a dictionary with one key, the key is the date and its value is the value
            * If it is a dictionary with two keys, the first value is the date and the second is the value
            * If it is of another type, it will raise an error

        The final return value is sorted by date

    Parameters:
    -----------
    data:
        The data for the time series. Can be a dictionary, a list of tuples, or a list of dictionaries.

    date_format: str
        The format of the date in strptime friendly format.

    Returns:
    -----------
        Returns a list of Tuples where the first element of each tuple is of datetime.datetime class
        and the second element is of float class

    Raises:
    --------
    TypeError: If the data is not in a format which can be parsed.
    """

    if isinstance(data, Mapping):
        return _preprocess_timeseries(list(data.items()), date_format)

    # str and bytes are Sequences as well, but they can never hold time series rows
    if isinstance(data, (str, bytes)) or not isinstance(data, Sequence):
        raise TypeError("Could not parse the data")

    # Nothing to convert; this also guards the data[0] look-up below
    if len(data) == 0:
        return []

    first = data[0]

    if isinstance(first, Sequence) and not isinstance(first, (str, bytes)):
        return sorted([(_parse_date(date, date_format), float(value)) for date, value in data])

    # If first element is not a dictionary or tuple, it cannot be parsed
    if not isinstance(first, Mapping):
        raise TypeError("Could not parse the data")

    if len(first) == 1:
        # tuple(*items) receives the single (key, value) pair as its only argument;
        # a row with more than one key makes tuple() raise a TypeError.
        rows: List[tuple] = [tuple(*row.items()) for row in data]
    elif len(first) == 2:
        rows = [tuple(row.values()) for row in data]
    else:
        raise TypeError("Could not parse the data")

    # The rows are now plain tuples and go through the list-of-tuples branch
    return _preprocess_timeseries(rows, date_format)


def _preprocess_match_options(
    as_on_match: str, prior_match: str, closest: str
) -> Tuple[datetime.timedelta, datetime.timedelta]:
    """Checks the match arguments and returns the corresponding timedelta objects

    Parameters:
    -----------
    as_on_match: str
        One of "exact", "previous", "next" or "closest".

    prior_match: str
        One of "exact", "previous", "next" or "closest".

    closest: str
        One of "exact", "previous" or "next".
        This value is substituted wherever the other two arguments are "closest".

    Returns:
    --------
        Returns a tuple of two timedelta objects (as_on_delta, prior_delta).
        "exact" maps to 0 days, "previous" to -1 day and "next" to +1 day.

    Raises:
    -------
        ValueError: If any of the three arguments is not an accepted value.
    """

    day_offsets = {"exact": 0, "previous": -1, "next": 1}

    # closest is validated first because it may stand in for the other two
    if closest not in day_offsets:
        raise ValueError(f"Invalid argument for closest: {closest}")

    if as_on_match == "closest":
        as_on_match = closest
    if prior_match == "closest":
        prior_match = closest

    if as_on_match not in day_offsets:
        raise ValueError(f"Invalid as_on_match argument: {as_on_match}")
    if prior_match not in day_offsets:
        raise ValueError(f"Invalid prior_match argument: {prior_match}")

    as_on_delta = datetime.timedelta(days=day_offsets[as_on_match])
    prior_delta = datetime.timedelta(days=day_offsets[prior_match])
    return as_on_delta, prior_delta


def _preprocess_from_to_date(
    from_date: datetime.date | str,
    to_date: datetime.date | str,
    time_series: Mapping = None,
    align_dates: bool = True,
    return_period_unit: Literal["years", "months", "days"] = None,
    return_period_value: int = None,
    as_on_match: str = "closest",
    prior_match: str = "closest",
    closest: Literal["previous", "next", "exact"] = "previous",
) -> tuple:
    """Fills in a missing from_date and/or to_date using the time series

    Parameters:
    -----------
    from_date: datetime.date | str
        Start date. If None, it is derived from the time series and the return period.

    to_date: datetime.date | str
        End date. If None, time_series.end_date is used.

    time_series:
        Object exposing start_date and end_date attributes which can be searched
        with _find_closest_date. Required if either date is None.

    align_dates: bool
        Currently not used by this function.

    return_period_unit, return_period_value:
        Passed to relativedelta as {unit: value} and added to time_series.start_date
        to get the expected start date. Required only if from_date is None.

    as_on_match, prior_match, closest: str
        Validated by _preprocess_match_options.
        Only the as_on_match delta is used for the date search.

    Returns:
    --------
        Returns a tuple of (from_date, to_date).
        Dates which were passed in are returned unchanged.

    Raises:
    -------
        ValueError: If a date is missing and no time series is given, if from_date has to be
            calculated but the return period is incomplete, or if a match option is invalid.
        Errors raised by _find_closest_date are propagated.
    """

    # Both options are validated here; only the as-on delta is needed below
    as_on_delta, _ = _preprocess_match_options(as_on_match, prior_match, closest)

    if (from_date is None or to_date is None) and time_series is None:
        raise ValueError("Provide either to_date and from_date or time_series data")

    if from_date is None:
        # The return period is only needed when from_date has to be calculated
        if return_period_unit is None or return_period_value is None:
            raise ValueError("Provide return period for calculation of from_date")

        expected_start_date = time_series.start_date + relativedelta(**{return_period_unit: return_period_value})
        # Search up to 999 steps; the first element of the matched row is used as the date
        from_date = _find_closest_date(time_series, expected_start_date, 999, as_on_delta, "fail")[0]

    if to_date is None:
        to_date = time_series.end_date

    return from_date, to_date


def _find_closest_date(
    data: Mapping[datetime.datetime, float],
    date: datetime.datetime,
    limit_days: int,
    delta: datetime.timedelta,
    if_not_found: Literal["fail", "nan"],
) -> Tuple[datetime.datetime, float]:
    """Helper function to find data for the closest available date

    Starting at date, the search moves by delta until data.get() returns a row,
    the step limit is used up, or the date leaves the range of the data.
    The search is iterative, so long gaps in the data cannot exhaust the recursion limit.

    Parameters:
    -----------
    data:
        TimeSeries data. Must provide get(date, default) and a data attribute
        which can be iterated to get the available dates.

    date: datetime.datetime
        The date at which the search starts.

    limit_days: int
        Maximum number of steps to take. The search stops when it reaches 0,
        so a negative value never stops the search on the limit.

    delta: datetime.timedelta
        The step between two attempts. A zero delta allows only an exact match.

    if_not_found: "fail" | "nan"
        Whether to raise an error or return NaN if no data is found.

    Returns:
    --------
        Returns the row returned by data.get() for the first date with data.
        If if_not_found is "nan":
            * (NaN, NaN) is returned if the search leaves the range of the data
            * (last date tried, NaN) is returned if the search is given up

    Raises:
    -------
        DateOutOfRangeError: If the search leaves the range of the data and if_not_found is not "nan"
        DateNotFoundError: If the search is given up and if_not_found is "fail"
        ValueError: If the search is given up and if_not_found is neither "fail" nor "nan"
    """

    no_step = datetime.timedelta(0)
    # The boundary does not change during the search, so it is looked up only once.
    # Only the boundary in the direction of travel is relevant.
    earliest = min(data.data) if delta < no_step else None
    latest = max(data.data) if delta > no_step else None

    while True:
        if earliest is not None and date < earliest:
            if if_not_found == "nan":
                return float("NaN"), float("NaN")
            raise DateOutOfRangeError(date, "min")
        if latest is not None and date > latest:
            if if_not_found == "nan":
                return float("NaN"), float("NaN")
            raise DateOutOfRangeError(date, "max")

        row: tuple = data.get(date, None)
        if row is not None:
            return row

        # Give up if only an exact match was requested or no steps are left
        if not delta or limit_days == 0:
            break
        date = date + delta
        limit_days -= 1

    if if_not_found == "fail":
        raise DateNotFoundError("Data not found for date", date)
    if if_not_found == "nan":
        return date, float("NaN")

    raise ValueError(f"Invalid argument for if_not_found: {if_not_found}")


def _interval_to_years(interval_type: Literal["years", "months", "days"], interval_value: int) -> float:
    """Converts any time period to years for use with compounding functions

    Parameters:
    -----------
    interval_type: "years" | "months" | "days"
        The unit of the interval.

    interval_value: int
        The length of the interval in the given unit.

    Returns:
    --------
        Returns the interval in years, taking a year as 12 months or 365 days.

    Raises:
    -------
        KeyError: If interval_type is not one of the supported units.
    """

    units_per_year = {"years": 1, "months": 12, "days": 365}
    return interval_value / units_per_year[interval_type]


def _is_eomonth(dates: Sequence[datetime.datetime], threshold: float = 0.7) -> bool:
    """Checks if a series should be treated as an end of month date series or not.

    If the proportion of eomonth dates exceeds the threshold, it will be treated as eomonth series.
    This can be used for any frequency, but will work only for monthly and lower frequencies.

    Parameters:
    -----------
    dates:
        The dates of the series.

    threshold: float, default 0.7
        The proportion of dates (between 0 and 1) which must be exceeded.

    Returns:
    --------
        Returns True if the proportion of month end dates is greater than the threshold.
        An empty series is never treated as an eomonth series.
    """

    # Without this guard the proportion below would divide by zero
    if len(dates) == 0:
        return False

    one_day = datetime.timedelta(days=1)
    # A date is a month end if the following day falls in a different month
    eomonth_count = sum(date.month != (date + one_day).month for date in dates)
    return eomonth_count / len(dates) > threshold


def covariance(series1: list, series2: list) -> float:
    """Returns the sample covariance of two series

        This is a compatibility function for Python versions prior to 3.10.
        It will be replaced with statistics.covariance when support is dropped for versions <3.10.
        Like statistics.covariance, it divides by n - 1, which also keeps it
        consistent with statistics.variance.

    Parameters
    ----------
    series1 : List
        A list of numbers
    series2 : list
        A list of numbers

    Returns
    -------
    float
        Returns the covariance as a float value

    Raises
    ------
    ValueError
        If the series are of different lengths or have fewer than two data points
    """

    n = len(series1)
    if len(series2) != n:
        raise ValueError("Lenght of both series must be same for covariance calcualtion.")
    if n < 2:
        raise ValueError("At least two data poitns are required for covariance calculation.")

    mean1 = statistics.mean(series1)
    mean2 = statistics.mean(series2)

    deviation_products = sum((x - mean1) * (y - mean2) for x, y in zip(series1, series2))

    # n - 1 (Bessel's correction) gives the sample covariance
    return deviation_products / (n - 1)