-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathNormalize.py
More file actions
81 lines (62 loc) · 2.4 KB
/
Normalize.py
File metadata and controls
81 lines (62 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import yaml
import numpy as np
import xarray as xr
import pandas as pd
from pathlib import Path
from Dataset import get_station_features, get_station_target
# ── fit poly and accumulate mean/var for input features ──────────────
def fit_X_scaler(train_idx, static_ds, time_indicators, variable, poly):
    """
    Single pass over the training stations to fit the input-feature scaler.

    ``poly`` is fitted on the first station (``fit_transform``) and only
    applied (``transform``) to every later one. The per-column mean and
    population variance of the transformed features are accumulated one
    station at a time, so only one station's feature matrix is needed at
    any moment.

    Each station's statistics are merged into the running totals with the
    pairwise update of Chan, Golub & LeVeque (running mean plus running sum
    of squared deviations). This replaces the raw-moment formula
    ``E[x^2] - E[x]^2``, which loses most of its significant digits when a
    column's mean is large compared with its spread.

    Parameters
    ----------
    train_idx : sized iterable
        Station identifiers; each one is passed as the first argument to
        ``get_station_features``. ``len(train_idx)`` is used in the
        progress message.
    static_ds, time_indicators, variable
        Forwarded unchanged to ``get_station_features``.
    poly : transformer
        Object exposing ``fit_transform`` and ``transform``. It is fitted
        as a side effect of this call.

    Returns
    -------
    mean_ : np.ndarray, shape (p,)
        Per-column mean of the transformed features.
    var_ : np.ndarray, shape (p,)
        Per-column population variance (divisor ``n``), floored at 1e-8.

    Raises
    ------
    ValueError
        If ``train_idx`` is empty or the stations contribute no rows.
    """
    EPS = 1e-8
    mean_ = None
    m2_ = None  # running sum of squared deviations from the running mean
    n_ = 0
    for count, i in enumerate(train_idx):
        X_i = get_station_features(i, static_ds, time_indicators, variable)
        X_i_poly = poly.fit_transform(X_i) if count == 0 else poly.transform(X_i)
        # Accumulate in float64 regardless of the dtype the transformer emits.
        X_i_poly = np.asarray(X_i_poly, dtype=np.float64)
        n_i = X_i_poly.shape[0]
        if mean_ is None:
            p = X_i_poly.shape[1]
            mean_ = np.zeros(p)
            m2_ = np.zeros(p)
        if n_i > 0:
            # Statistics of this station alone, centred on its own mean.
            batch_mean = X_i_poly.mean(axis=0)
            batch_m2 = ((X_i_poly - batch_mean) ** 2).sum(axis=0)
            # Chan et al. merge of (n_, mean_, m2_) with the station's stats.
            delta = batch_mean - mean_
            n_new = n_ + n_i
            mean_ = mean_ + delta * (n_i / n_new)
            m2_ = m2_ + batch_m2 + delta ** 2 * (n_ * n_i / n_new)
            n_ = n_new
        if (count + 1) % 100 == 0:
            print(f'Fit X scaler: {count + 1}/{len(train_idx)} stations', flush=True)
    if n_ == 0:
        raise ValueError('fit_X_scaler: no training rows to fit on (empty train_idx or empty stations)')
    # Population variance; the floor keeps constant columns from producing
    # a zero divisor when the caller standardises with sqrt(var_).
    var_ = np.maximum(m2_ / n_, EPS)
    return mean_, var_
# ── accumulate mean/var for the target ──────────────────────────────
def fit_y_scaler(train_idx, era5_ds, madis_ds, variable):
    """
    Single pass over the training stations to fit the target scaler.

    The mean and population variance of the target are accumulated one
    station at a time. Every element returned by ``get_station_target`` is
    treated as one scalar sample (the original code annotates the return
    as ``(n_time,)``).

    Each station's statistics are merged into the running totals with the
    pairwise update of Chan, Golub & LeVeque (running mean plus running sum
    of squared deviations). This replaces the raw-moment formula
    ``E[x^2] - E[x]^2``, which loses most of its significant digits when
    the mean is large compared with the spread.

    NaN values in a station's target are not filtered here and propagate
    into the returned statistics.

    Parameters
    ----------
    train_idx : sized iterable
        Station identifiers; each one is passed as the first argument to
        ``get_station_target``. ``len(train_idx)`` is used in the progress
        message.
    era5_ds, madis_ds, variable
        Forwarded unchanged to ``get_station_target``.

    Returns
    -------
    mean_ : np.ndarray, shape (1,)
        Mean of the target over all training samples.
    var_ : np.ndarray, shape (1,)
        Population variance (divisor ``n``), floored at 1e-8.

    Raises
    ------
    ValueError
        If ``train_idx`` is empty or the stations contribute no samples.
    """
    EPS = 1e-8
    mean_ = np.zeros(1)
    m2_ = np.zeros(1)  # running sum of squared deviations from the running mean
    n_ = 0
    for count, i in enumerate(train_idx):
        y_i = get_station_target(i, era5_ds, madis_ds, variable)  # (n_time,)
        # Flat float64 view so every element counts as one sample.
        y_i = np.asarray(y_i, dtype=np.float64).ravel()
        n_i = y_i.size
        if n_i > 0:
            # Statistics of this station alone, centred on its own mean.
            batch_mean = y_i.mean()
            batch_m2 = ((y_i - batch_mean) ** 2).sum()
            # Chan et al. merge of (n_, mean_, m2_) with the station's stats.
            delta = batch_mean - mean_
            n_new = n_ + n_i
            mean_ = mean_ + delta * (n_i / n_new)
            m2_ = m2_ + batch_m2 + delta ** 2 * (n_ * n_i / n_new)
            n_ = n_new
        if (count + 1) % 100 == 0:
            print(f'Fit y scaler: {count + 1}/{len(train_idx)} stations', flush=True)
    if n_ == 0:
        raise ValueError('fit_y_scaler: no training samples to fit on (empty train_idx or empty stations)')
    # Population variance; the floor keeps a constant target from producing
    # a zero divisor when the caller standardises with sqrt(var_).
    var_ = np.maximum(m2_ / n_, EPS)
    return mean_, var_