Hello all
I am developing a Gaussian Process regression python package (pyeogpr) running with openEO.
The package needs to retrieve biophysical trait maps, yielding two bands: variable and uncertainty. To infer these the package uses Gaussian Process regression, with matrix/array mathematics.
However, due to the back-end chunking up the satellite image into blocks of 128, the matrix multiplication gets memory inefficient. Is there a ways to solve this? For the variable band it works smooth, however for uncertainty the calculation is more complex and the batch job fails.
Job id: j-240827f758f84022aa3e1771fdc335c1
The code seems to fail when I declare the “Uncertainty” variable. Please help
udf_cloud = openeo.UDF("""
import importlib
import os
import sys
import time
dir_path = 'tmp/venv'
sys.path.insert(0, dir_path)
python_files = [f[:-3] for f in os.listdir(dir_path) if f.endswith('.py') and f !="__init__.py"]
for lib in python_files:
globals()[lib] = __import__(lib)
from configparser import ConfigParser
from typing import Dict
from openeo.metadata import Band, CollectionMetadata
from openeo.udf import XarrayDataCube, inspect
import numpy as np
import xarray as xr
from pathlib import Path
from openeo.udf.debug import inspect
chunks = 128
def broadcaster(array):
return np.broadcast_to(array[:, np.newaxis, np.newaxis], (10, chunks, chunks))
init_xr = xr.DataArray()
init_xr_u = xr.DataArray()
def apply_datacube(cube: xarray.DataArray, context: dict) -> xarray.DataArray:
pixel_spectra = (cube.values)
sensor = context["sensor"]
variable = context["biovar"]
model = globals()[sensor + "_" + variable]
hyp_ell_GREEN = broadcaster(model.hyp_ell_GREEN)
mx_GREEN = broadcaster(model.mx_GREEN.ravel())
sx_GREEN = broadcaster(model.sx_GREEN.ravel())
XDX_pre_calc_GREEN_broadcast = np.broadcast_to(model.XDX_pre_calc_GREEN.ravel()[:,np.newaxis,np.newaxis],(model.XDX_pre_calc_GREEN.shape[0], chunks, chunks))
im_norm_ell2D_hypell = ((pixel_spectra - mx_GREEN) / sx_GREEN) * hyp_ell_GREEN
im_norm_ell2D = ((pixel_spectra - mx_GREEN) / sx_GREEN)
PtTPt = np.einsum('ijk,ijk->ijk', im_norm_ell2D_hypell, im_norm_ell2D) * (-0.5)
PtTDX = np.einsum('ij,jkl->ikl', model.X_train_GREEN,im_norm_ell2D_hypell)
arg1 = np.exp(PtTPt[0]) * model.hyp_sig_GREEN
k_star_im = np.exp(PtTDX - (XDX_pre_calc_GREEN_broadcast * (0.5)))
mean_pred = (np.einsum('ijk,i->jk',k_star_im, model.alpha_coefficients_GREEN.ravel()) * arg1) + model.mean_model_GREEN
k_star_uncert_im = np.exp(PtTDX - (XDX_pre_calc_GREEN_broadcast * (0.5))) * arg1
k_star_uncert = np.expand_dims(k_star_uncert_im, axis=0)
Vvector = np.einsum('ij,jabc->ibc', model.Linv_pre_calc_GREEN, k_star_uncert)#.reshape(-1,256,256)
#the code seems to fail here
Uncertainty = math.sqrt(abs(np.expand_dims(model.hyp_sig_GREEN, axis=0) - np.dot(Vvector,Vvector))) # this should be unc. Currently overreaching memory limits
init_xr = mean_pred
# init_xr_u = Uncertainty
returned = xr.DataArray(init_xr)
# returned = xr.DataArray(np.stack([init_xr, init_xr_u], axis=0),
# dims=["band", "x", "y"],
# coords={"band": ["Variable", "Uncertainty"]})
return returned
""",context={"from_parameter": "context"})
sensor = "SENTINEL2_L2A"
biovar = "CNC_Cab"
context = {"sensor": sensor,
"biovar":biovar}
S2_cloud = s2_cube.apply_dimension(process=udf_cloud,
dimension="bands",
context =context)#.filter_bands(bands = ["B02"])
S2_cloud_process = S2_cloud.execute_batch(
title=f"{sensor} _ {biovar}",
outputfile=f"{sensor} _ {biovar}.nc",
job_options = {
'executor-memory': '10g',
'udf-dependency-archives': [
'https://github.com/daviddkovacs/pyeogpr/raw/main/models/GPR_models_bulk.zip#tmp/venv'
]
}
)