Frequency validation and automatic assignment during creation

modified tests to account for the changes.
This commit is contained in:
Gourav Kumar 2022-06-05 12:53:40 +05:30
parent a6fcd29a34
commit c605f71f10
3 changed files with 102 additions and 12 deletions

View File

@ -6,7 +6,17 @@ import warnings
from collections import UserList from collections import UserList
from dataclasses import dataclass from dataclasses import dataclass
from numbers import Number from numbers import Number
from typing import Any, Callable, Iterable, List, Literal, Mapping, Sequence, Type from typing import (
Any,
Callable,
Iterable,
List,
Literal,
Mapping,
Sequence,
Tuple,
Type,
)
from dateutil.relativedelta import relativedelta from dateutil.relativedelta import relativedelta
@ -289,6 +299,51 @@ class Series(UserList):
return self.__class__([i + other for i in self], self.dtype.__name__) return self.__class__([i + other for i in self], self.dtype.__name__)
def _validate_frequency(
data: List[Tuple[datetime.datetime, float]], provided_frequency: Literal["D", "W", "M", "Q", "H", "Y"] = None
):
"""Checks the data and returns the expected frequency."""
if provided_frequency is not None:
provided_frequency = getattr(AllFrequencies, provided_frequency)
start_date = data[0][0]
end_date = data[-1][0]
overall_gap = (end_date - start_date).days
num_data_points = len(data)
# days_per_data = num_data_points / overall_gap
expected_data_points = {
"D": (round(overall_gap * 0.6, 0), round(overall_gap * 1.05 + 1, 0)),
"W": (round(overall_gap / 7 * 0.7, 0), round(overall_gap / 7 * 1.05 + 1, 0)),
"M": (round(overall_gap / 30 * 0.8, 0), round(overall_gap / 30 * 1.05 + 1, 0)),
"Q": (round(overall_gap / 92 * 0.85, 0), round(overall_gap / 92 * 1.05 + 1, 0)),
"H": (round(overall_gap / 182 * 0.85, 0), round(overall_gap / 182 * 1.05 + 1, 0)),
"A": (round(overall_gap / 365 * 0.85, 0), round(overall_gap / 365 * 1.05 + 1, 0)),
}
for frequency, (min, max) in expected_data_points.items():
if min <= num_data_points <= max:
expected_frequency = frequency
break
else:
raise ValueError("Data does not match any known frequency. Perhaps you have too many missing data points.")
expected_data_points = expected_data_points[expected_frequency]
if provided_frequency is None:
frequency_match = None
elif provided_frequency.symbol == expected_frequency:
frequency_match = True
else:
frequency_match = False
return {
"gap": overall_gap,
"expected_data_points": expected_data_points,
"actual_data_points": num_data_points,
"expected_frequency": expected_frequency,
"frequency_match": frequency_match,
}
@Mapping.register @Mapping.register
class TimeSeriesCore: class TimeSeriesCore:
"""Defines the core building blocks of a TimeSeries object""" """Defines the core building blocks of a TimeSeries object"""
@ -296,7 +351,8 @@ class TimeSeriesCore:
def __init__( def __init__(
self, self,
ts_data: List[Iterable] | Mapping, ts_data: List[Iterable] | Mapping,
frequency: Literal["D", "W", "M", "Q", "H", "Y"], frequency: Literal["D", "W", "M", "Q", "H", "Y"] = None,
validate_frequency: bool = True,
date_format: str = "%Y-%m-%d", date_format: str = "%Y-%m-%d",
): ):
"""Instantiate a TimeSeriesCore object """Instantiate a TimeSeriesCore object
@ -308,9 +364,21 @@ class TimeSeriesCore:
The first element of each tuple should be a date and second element should be a value. The first element of each tuple should be a date and second element should be a value.
In case of dictionary, the key should be the date. In case of dictionary, the key should be the date.
frequency : str frequency : str, optional
The frequency of the time series. The frequency of the time series.
Valid values are {D, W, M, Q, H, Y} Valid values are {D, W, M, Q, H, Y}
If no frequency is provided, it will be inferred from the data.
Frequency assignment uses approximation and hence the assignment may be incorrect if
there are fewer than 12 data points.
validate_frequency: boolean, default True
Whether the provided frequency should be validated against the data.
When set to True, if the expected number of data points are not withint the expected limits,
it will raise an Exception and object creation will fail.
Validation is performed only if data contains at least 12 data points, as a fewer number of
data points are not sufficient to determine the frequency correctly.
This parameter will be ignored if frequency is not provided.
refer core._validate_frequency for more details.
date_format : str, optional, default "%Y-%m-%d" date_format : str, optional, default "%Y-%m-%d"
Specify the format of the date Specify the format of the date
@ -319,10 +387,24 @@ class TimeSeriesCore:
ts_data = _preprocess_timeseries(ts_data, date_format=date_format) ts_data = _preprocess_timeseries(ts_data, date_format=date_format)
validation = _validate_frequency(data=ts_data, provided_frequency=frequency)
if frequency is None:
frequency = validation["expected_frequency"]
self.frequency = getattr(AllFrequencies, frequency)
if validate_frequency and len(ts_data) >= 12:
if validation["frequency_match"] is not None and not validation["frequency_match"]:
raise ValueError(
f"Data appears to be of frquency {validation['expected_frequency']!r}, "
f"but {frequency!r} was provided. Pass the correct frequency."
"\nPass validate_frequency=False to disable this validation."
)
self.data = dict(ts_data) self.data = dict(ts_data)
if len(self.data) != len(ts_data): if len(self.data) != len(ts_data):
warnings.warn("The input data contains duplicate dates which have been ignored.") warnings.warn("The input data contains duplicate dates which have been ignored.")
self.frequency: Frequency = getattr(AllFrequencies, frequency) # self.frequency: Frequency = getattr(AllFrequencies, frequency)
self.iter_num: int = -1 self.iter_num: int = -1
self._dates: list = None self._dates: list = None
self._values: list = None self._values: list = None

View File

@ -118,26 +118,34 @@ class TimeSeries(TimeSeriesCore):
* List of dictionaries with 2 keys, first representing date & second representing value * List of dictionaries with 2 keys, first representing date & second representing value
* Dictionary of key: value pairs * Dictionary of key: value pairs
date_format : str, optional, default "%Y-%m-%d"
Specify the format of the date
Required only if the first argument of tuples is a string. Otherwise ignored.
frequency : str, optional, default "infer" frequency : str, optional, default "infer"
The frequency of the time series. Default is infer. The frequency of the time series. Default is infer.
The class will try to infer the frequency automatically and adjust to the closest member. The class will try to infer the frequency automatically and adjust to the closest member.
Note that inferring frequencies can fail if the data is too irregular. Note that inferring frequencies can fail if the data is too irregular.
Valid values are {D, W, M, Q, H, Y} Valid values are {D, W, M, Q, H, Y}
validate_frequency: boolean, default True
Whether the provided frequency should be validated against the data.
When set to True, if the expected number of data points are not withint the expected limits,
it will raise an Exception and object creation will fail.
This parameter will be ignored if frequency is not provided.
refer core._validate_frequency for more details.
date_format : str, optional, default "%Y-%m-%d"
Specify the format of the date
Required only if the first argument of tuples is a string. Otherwise ignored.
""" """
def __init__( def __init__(
self, self,
data: List[Iterable] | Mapping, data: List[Iterable] | Mapping,
frequency: Literal["D", "W", "M", "Q", "H", "Y"], frequency: Literal["D", "W", "M", "Q", "H", "Y"] = None,
validate_frequency: bool = False,
date_format: str = "%Y-%m-%d", date_format: str = "%Y-%m-%d",
): ):
"""Instantiate a TimeSeriesCore object""" """Instantiate a TimeSeriesCore object"""
super().__init__(data, frequency, date_format) super().__init__(data, frequency, validate_frequency, date_format)
def info(self) -> str: def info(self) -> str:
"""Summary info about the TimeSeries object""" """Summary info about the TimeSeries object"""

View File

@ -183,7 +183,7 @@ class TestSetitem:
data = [("2021-01-01", 220), ("2021-01-04", 230), ("2021-03-07", 240)] data = [("2021-01-01", 220), ("2021-01-04", 230), ("2021-03-07", 240)]
def test_setitem(self): def test_setitem(self):
ts = TimeSeriesCore(self.data, frequency="D") ts = TimeSeriesCore(self.data, frequency="M")
assert len(ts) == 3 assert len(ts) == 3
ts["2021-01-02"] = 225 ts["2021-01-02"] = 225
@ -195,7 +195,7 @@ class TestSetitem:
assert ts["2021-01-02"][1] == 227.6 assert ts["2021-01-02"][1] == 227.6
def test_errors(self): def test_errors(self):
ts = TimeSeriesCore(self.data, frequency="D") ts = TimeSeriesCore(self.data, frequency="M")
with pytest.raises(TypeError): with pytest.raises(TypeError):
ts["2021-01-03"] = "abc" ts["2021-01-03"] = "abc"