Synthetic data generation#
[1]:
import json
from itertools import islice
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import AutoMinorLocator, MultipleLocator
[2]:
from gluonts.core import serde
from gluonts.dataset.artificial import recipe as rcp
[3]:
# plotting utils
def plot_recipe(recipe, length):
output_dict = rcp.evaluate(recipe, length)
K = len(output_dict)
lct = MultipleLocator(288)
minor = AutoMinorLocator(12)
fig, axs = plt.subplots(K, 1, figsize=(16, 2 * len(recipe)))
for i, k in enumerate(output_dict):
axs[i].xaxis.set_major_locator(lct)
axs[i].xaxis.set_minor_locator(minor)
axs[i].plot(output_dict[k])
axs[i].grid()
axs[i].set_ylabel(k)
def plot_examples(target, length, num, anomaly_indicator=None):
fix, axs = plt.subplots(num, 1, figsize=(16, num * 2))
for i in range(num):
xx = rcp.evaluate(
dict(target=target, anomaly_indicator=anomaly_indicator), length
)
axs[i].plot(xx["target"])
axs[i].set_ylim(0, 1.1 * np.max(xx["target"]))
axs[i].grid()
if anomaly_indicator is not None:
axs[i].fill_between(
np.arange(len(xx["target"])),
xx["anomaly_indicator"] * 1.1 * np.max(xx["target"]),
np.zeros(len(xx["target"])),
alpha=0.3,
color="red",
)
def print_dicts(*dicts):
for d in dicts:
print("{")
for k, v in d.items():
print("\t", k, ": ", v)
print("}\n")
Data Generation Recipes#
To generate realistic artificial data, we describe the data generation process through a symbolic graph (this is akin to how mxnet symbol graphs work).
Your graph can contain python values as well as operators that correspond to random variables or random processes. The output of a recipe can be a list, dictionary or a value:
[4]:
rcp.evaluate(rcp.RandomGaussian(), length=5)
[4]:
array([-0.14623032, -1.06986359, 1.05844796, 0.75534637, -1.39678759])
[5]:
rcp.evaluate({"var1": rcp.RandomGaussian(), "var2": 3.0}, length=5)
[5]:
{'var1': array([ 1.4089298 , 1.38211622, 1.7198381 , -1.48442956, -0.63099337]),
'var2': 3.0}
[6]:
rcp.evaluate([3.0, rcp.RandomUniform()], length=5)
[6]:
[3.0, array([0.19885428, 0.60489994, 0.42505934, 0.2305186 , 0.24688702])]
[7]:
recipe = dict(myOutput1=rcp.RandomGaussian())
# multiple evaluations lead to different results, due to randomness
print_dicts(
rcp.evaluate(recipe, length=5),
rcp.evaluate(recipe, length=5),
)
{
myOutput1 : [ 0.45000225 -2.2213785 -0.1528156 1.42434522 0.3139183 ]
}
{
myOutput1 : [ 0.17910279 -0.31140746 -0.2432074 -0.15257331 1.40217193]
}
Referencing variables#
Each time you create a random variable such as RandomGaussian
the variable refers to a new independent RV. You can re-use and refer to previously created random variables.
[8]:
stddev1 = 2.0
stddev2 = rcp.RandomUniform(low=0, high=1, shape=(1,))
x1 = rcp.RandomGaussian(stddev=stddev1)
x2 = rcp.RandomGaussian(stddev=stddev2)
x3 = 2 * x2
recipe = dict(x1=x1, x2=x2, x3=x3)
# multiple evaluations lead to different results, due to randomness
print_dicts(rcp.evaluate(recipe, length=5), rcp.evaluate(recipe, length=5))
{
x1 : [-4.15236783 3.33089287 -1.7650173 4.31840226 -3.24158136]
x2 : [ 0.45371147 -1.22840268 -0.091649 0.56955048 0.31684371]
x3 : [ 0.90742294 -2.45680536 -0.183298 1.13910097 0.63368743]
}
{
x1 : [ 1.21217111 -1.03711262 0.44268618 -2.64438173 -0.20533495]
x2 : [ 0.13661998 -0.12320349 -1.00964885 -1.16800566 -0.95996586]
x3 : [ 0.27323996 -0.24640699 -2.01929769 -2.33601131 -1.91993172]
}
Note that you may create and use intermediate random variables such as stddev2
in the above example without including them in the output.
[9]:
recipe = dict(random_out=rcp.RandomGaussian(shape=(1,)), fixed_out=np.random.randn(1))
# note that fixed_out stays the same;
# it's evaluated only once when the recipe is created
print_dicts(rcp.evaluate(recipe, length=1), rcp.evaluate(recipe, length=1))
{
random_out : [0.34182422]
fixed_out : [-0.8086896]
}
{
random_out : [-0.35283937]
fixed_out : [-0.8086896]
}
Length#
Most operators in the recipe
package have a length
argument that is automatically passed when the expression is evaluated. The idea is that these recipes are used to generate fixed-length time series, and most operators produce individual components of the time series that have the same length.
[10]:
recipe = dict(random_gaussian=rcp.RandomGaussian(), constant_vec=rcp.ConstantVec(42))
print_dicts(rcp.evaluate(recipe, length=3), rcp.evaluate(recipe, length=5))
{
random_gaussian : [ 0.15885871 -1.1668147 2.01667793]
constant_vec : [42. 42. 42.]
}
{
random_gaussian : [ 2.01354963 -2.28381719 0.14022704 -0.14361124 -0.95813227]
constant_vec : [42. 42. 42. 42. 42.]
}
Operator Overloading#
The operators defined in the recipe
package overload the basic arithmetic operations (addition, subtraction, multiplication, division).
[11]:
x1 = 42 * rcp.ConstantVec(1)
x2 = x1 * rcp.RandomUniform()
x3 = rcp.RandomGaussian() + rcp.RandomUniform()
result = x1 + x2 + x3
rcp.evaluate(result, 3)
[11]:
array([60.47680942, 81.13694002, 70.76948986])
SerDe#
Recipes composed of serializable / representable components can easily be serialized / deserialized.
[12]:
dumped = serde.encode(result)
print(dumped)
reconstructed = serde.decode(dumped)
rcp.evaluate(reconstructed, 3)
{'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe._LiftedBinaryOp', 'args': [], 'kwargs': {'left': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe._LiftedBinaryOp', 'args': [], 'kwargs': {'left': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe._LiftedBinaryOp', 'args': [], 'kwargs': {'left': 42, 'op': '*', 'right': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe.ConstantVec', 'args': [], 'kwargs': {'constant': 1}}}}, 'op': '+', 'right': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe._LiftedBinaryOp', 'args': [], 'kwargs': {'left': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe._LiftedBinaryOp', 'args': [], 'kwargs': {'left': 42, 'op': '*', 'right': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe.ConstantVec', 'args': [], 'kwargs': {'constant': 1}}}}, 'op': '*', 'right': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe.RandomUniform', 'args': [], 'kwargs': {'high': 1.0, 'low': 0.0, 'shape': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'builtins.tuple', 'args': [[0]]}}}}}}}, 'op': '+', 'right': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe._LiftedBinaryOp', 'args': [], 'kwargs': {'left': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe.RandomGaussian', 'args': [], 'kwargs': {'shape': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'builtins.tuple', 'args': [[0]]}, 'stddev': 1.0}}, 'op': '+', 'right': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'gluonts.dataset.artificial.recipe.RandomUniform', 'args': [], 'kwargs': {'high': 1.0, 'low': 0.0, 'shape': {'__kind__': <Kind.Instance: 'instance'>, 'class': 'builtins.tuple', 'args': [[0]]}}}}}}}
[12]:
array([83.17622164, 77.74813707, 77.95044445])
Simple Examples#
[13]:
daily_smooth_seasonality = rcp.SmoothSeasonality(period=288, phase=-72)
noise = rcp.RandomGaussian(stddev=0.1)
signal = daily_smooth_seasonality + noise
recipe = dict(
daily_smooth_seasonality=daily_smooth_seasonality, noise=noise, signal=signal
)
plot_recipe(recipe, 3 * 288)
[14]:
slope = rcp.RandomUniform(low=0, high=3, shape=(1,))
trend = rcp.LinearTrend(slope=slope)
daily_smooth_seasonality = rcp.SmoothSeasonality(period=288, phase=-72)
noise = rcp.RandomGaussian(stddev=0.1)
signal = trend + daily_smooth_seasonality + noise
plot_examples(signal, 3 * 288, 5)
Composing Recipes#
There are many ways to combine and extend generation recipes. For example using python functions.
[15]:
def weekly_seasonal_unscaled():
daily_smooth_seasonality = rcp.SmoothSeasonality(period=288, phase=-72)
weekday_scale = rcp.RandomUniform(0.1, 10, shape=(1,))
weekly_pattern = rcp.NormalizeMax(
rcp.Concatenate([weekday_scale * np.ones(5), np.ones(2)])
)
day_of_week = rcp.Dilated(rcp.Repeated(weekly_pattern), 288)
level = rcp.RandomUniform(low=0, high=10, shape=1)
noise_level = rcp.RandomUniform(low=0.01, high=1, shape=1)
noise = noise_level * rcp.RandomGaussian()
signal = daily_smooth_seasonality * day_of_week
unscaled = level + signal + noise
return dict(
daily_smooth_seasonality=daily_smooth_seasonality,
weekday_scale=weekday_scale,
weekly_pattern=weekly_pattern,
day_of_week=day_of_week,
level=level,
noise_level=noise_level,
noise=noise,
signal=signal,
unscaled=unscaled,
)
recipe = weekly_seasonal_unscaled()
plot_recipe(recipe, 10 * 288)
plot_examples(recipe["unscaled"], 10 * 288, 5)
[16]:
def weekly_seasonal():
c = weekly_seasonal_unscaled()
unscaled = c["unscaled"]
scale = rcp.RandomUniform(low=0, high=1000, shape=1)
z = scale * unscaled
return z
plot_examples(weekly_seasonal(), 10 * 288, 5)
Here is a more complex example
[17]:
def scale(unscaled):
s = rcp.RandomUniform(low=0, high=1000, shape=1)
z = s * unscaled
return z
def complex_weekly_seasonality():
daily_pattern = rcp.RandomUniform(0, 1, shape=(24,))
daily_seasonality = rcp.Dilated(rcp.Repeated(daily_pattern), 12)
weekly_pattern = rcp.RandomUniform(0, 1, shape=(7,))
weekly_seasonality = rcp.Dilated(rcp.Repeated(weekly_pattern), 288)
unnormalized_seasonality = daily_seasonality * weekly_seasonality
seasonality = rcp.NormalizeMax(unnormalized_seasonality)
noise_level = rcp.RandomUniform(low=0.01, high=0.1, shape=1)
noise = noise_level * rcp.RandomGaussian()
level = rcp.RandomUniform(low=0, high=10, shape=1)
signal = level + seasonality
unscaled = signal + noise
return scale(unscaled)
plot_examples(complex_weekly_seasonality(), 10 * 288, 5)
Generating Anomalies#
Anomalies are just another effect added/multiplied to a base time series. We can define a recipe for creating certain types of anomalies, and then compose it with a base recipe.
[18]:
z = rcp.ConstantVec(1.0)
def inject_anomalies(z):
normal_indicator = rcp.BinaryMarkovChain(one_to_zero=1 / (288 * 7), zero_to_one=0.1)
anomaly_indicator = 1 - normal_indicator
anomaly_scale = 0.5 + rcp.RandomUniform(-1.0, 1.0, shape=1)
anomaly_multiplier = 1 + anomaly_scale * anomaly_indicator
target = z * anomaly_multiplier
return target, anomaly_indicator
target, anomaly_indicator = inject_anomalies(z)
plot_examples(target, 10 * 288, 5, anomaly_indicator)
[19]:
target, anomaly_indicator = inject_anomalies(weekly_seasonal())
plot_examples(target, 288 * 7, 5, anomaly_indicator)
Generating Changepoints#
[20]:
level = rcp.RandomUniform(0, 10, shape=1)
noise_level = rcp.RandomUniform(0.01, 1, shape=1)
noise = rcp.RandomGaussian(noise_level)
homoskedastic_gaussian_noise = level + noise
[21]:
z1 = homoskedastic_gaussian_noise
z2 = weekly_seasonal_unscaled()["unscaled"]
z_stacked = rcp.Stack([z1, z2])
change = rcp.RandomChangepoints(1)
unscaled = rcp.Choose(z_stacked, change)
target = scale(unscaled)
target, anomaly_indicator = inject_anomalies(target)
[22]:
plot_examples(target, 288 * 7, 10, anomaly_indicator)
Generating several time series#
[23]:
rcp.take_as_list(rcp.generate(10, weekly_seasonal_unscaled(), "2018-01-01", {}), 2)
[23]:
[{'daily_smooth_seasonality': array([0. , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
'weekday_scale': array([5.53325369]),
'weekly_pattern': array([1. , 1. , 1. , 1. , 1. ,
0.18072549, 0.18072549]),
'day_of_week': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
'level': array([7.15189366]),
'noise_level': array([0.60673574]),
'noise': array([-1.37627579, 0.80910965, -0.5113108 , 1.19522357, 0.76819937,
-0.30693338, 1.54426428, 0.65576722, 0.29384949, 0.35138523]),
'signal': array([0. , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
'unscaled': array([5.77561787, 7.9611223 , 6.64105875, 8.34818777, 7.92199568,
6.84793212, 8.70043552, 7.81348013, 7.45333928, 7.51288625]),
'item_id': '0',
'start': '2018-01-01'},
{'daily_smooth_seasonality': array([0. , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
'weekday_scale': array([8.71312027]),
'weekly_pattern': array([1. , 1. , 1. , 1. , 1. ,
0.11476945, 0.11476945]),
'day_of_week': array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]),
'level': array([9.78618342]),
'noise_level': array([0.80116698]),
'noise': array([ 1.19700682, -0.16436603, 0.2508195 , -0.6842733 , -2.04537114,
0.52365764, 0.69255774, -0.59459811, 1.81845245, -1.16518975]),
'signal': array([0. , 0.00011899, 0.00047589, 0.00107054, 0.00190265,
0.00297183, 0.00427757, 0.00581924, 0.00759612, 0.00960736]),
'unscaled': array([10.98319024, 9.62193638, 10.03747882, 9.10298066, 7.74271494,
10.31281289, 10.48301873, 9.19740456, 11.612232 , 8.63060103]),
'item_id': '1',
'start': '2018-01-01'}]
Saving to a file#
[24]:
def write_to_file(recipe, length, num_ts, fields, fn):
with open(fn, "w") as f, open(fn + "-all", "w") as g:
for x in islice(rcp.generate(length, recipe, "2019-01-07 00:00"), num_ts):
z = {}
for k in x:
if type(x[k]) == np.ndarray:
z[k] = x[k].tolist()
else:
z[k] = x[k]
xx = {}
for fi in fields:
xx[fi] = z[fi]
try:
f.write(json.dumps(xx))
except Exception as e:
print(xx)
print(z)
raise e
f.write("\n")
g.write(json.dumps(z))
g.write("\n")