16.1 Imports

import geopandas as gpd
import hvplot.pandas
import hvplot.xarray  # noqa
import numpy as np
import pandas as pd
import xarray as xr

from envrs.download_path import make_url

16.2 Read the data

# Read the cube
cube_uri = make_url("HLS_clip4plots_both_b30_v20.zarr.zip", is_zip=True)

full_cube = xr.open_dataset(
    cube_uri,
    engine="zarr",
    consolidated=False,
).compute()

# Read the points, reproject to match the datacube, and set the index
points_uri = make_url("timeline_points.geojson")
points = (
    gpd.read_file(points_uri)
    .to_crs(full_cube["spatial_ref"].attrs["crs_wkt"])
    .set_index("intact")
)
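
As an optional sanity check (a sketch, not part of the original workflow), we can confirm that the reprojected points fall inside the cube's spatial extent:

# Compare the point bounds with the cube's coordinate range
minx, miny, maxx, maxy = points.total_bounds
inside_x = float(full_cube["x"].min()) <= minx and maxx <= float(full_cube["x"].max())
inside_y = float(full_cube["y"].min()) <= miny and maxy <= float(full_cube["y"].max())
print("points inside cube extent:", inside_x and inside_y)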

16.3 Specify the variables indicating cloud/shadow presence

cirrus_col = "cirrus cloud"
cloud_col = "cloud"
adjacent_col = "adjacent to cloud"
shadow_col = "cloud shadow"
tainted_cols = [adjacent_col, cloud_col, cirrus_col, shadow_col]

16.4 Select the clearest observations

# Count, per time step, how many pixels are flagged by any of the quality masks
tainted_frame = (
    full_cube[tainted_cols]
    .to_dataarray(dim="mask")
    .any(dim="mask")
    .sum(dim=["x", "y"])
    .to_dataframe(name="count_tainted")
    .sort_values(by="count_tainted")
)

# The frame is sorted by count, so head(1) keeps the clearest scene per month
monthly_clearest = tainted_frame.groupby(pd.Grouper(freq="ME")).head(1)
monthly_cube = full_cube.sel(time=monthly_clearest.index.values)

# Scenes with no flagged pixels at all
fully_clear = tainted_frame[tainted_frame["count_tainted"] == 0]
clear_cube = full_cube.sel(time=fully_clear.index.values)
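
As a quick check on how aggressive each filter is (a sketch, not in the original notebook), the number of retained scenes can be printed:

print(len(tainted_frame), "scenes in total")
print(len(monthly_clearest), "monthly clearest scenes")
print(len(fully_clear), "fully clear scenes")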

16.5 What an image time series looks like

16.5.1 Images need to be converted to 8 bits to set the color stretch

def to_rgb8(cube, r, g, b, vmax, vmin=0):
    # Stack the three requested bands along a new "band" dimension
    selected = cube[[r, g, b]].to_dataarray("band")
    # Linear stretch between vmin and vmax
    stretched = (selected - vmin) / (vmax - vmin)

    # Zero out pixels where any band is negative or not finite
    is_positive = (stretched >= 0) & np.isfinite(stretched)
    positive = stretched.where(is_positive.all(dim="band"), 0)
    # Scale to 0-255, cast to 8 bit, and tag the composite with its band combination
    clipped = (
        np.clip(255 * positive, 0, 255)
        .astype(np.uint8)
        .expand_dims({"composite": [f"{r=}, {g=}, {b=}"]})
    )
    # Rename the band coordinate to generic r/g/b labels
    clipped["band"] = np.array(["r", "g", "b"], dtype="unicode")

    return clipped


def plot_rgb(*args, dimname="composite"):
    # Concatenate the composites and show them side by side, with a time slider
    return xr.concat(args, dim=dimname).hvplot.rgb(
        x="x",
        y="y",
        bands="band",
        by=dimname,
        groupby="time",
        subplots=True,
        rasterize=True,
        data_aspect=1,
        xaxis=False,
        yaxis=None,
        widget_location="bottom",
    )

16.6 Plot a time series of the images

plot_rgb(
    to_rgb8(monthly_cube, r="Red", g="Green", b="Blue", vmax=0.15),
    to_rgb8(monthly_cube, r="NIRnarrow", g="SWIR1", b="Red", vmax=0.40),
)

16.7 Plot just the “fully” clear images

plot_rgb(
    to_rgb8(clear_cube, r="Red", g="Green", b="Blue", vmax=0.15),
    to_rgb8(clear_cube, r="NIRnarrow", g="SWIR1", b="Red", vmax=0.40),
)

16.8 Satellite pixel time series as tables

16.8.1 Pick two points: one that has been deforested and one that is intact

# https://tutorial.xarray.dev/intermediate/indexing/advanced-indexing.html
# #orthogonal-indexing-in-xarray
sel_cube = full_cube.sel(
    x=xr.DataArray(points.geometry.x, dims="intact"),
    y=xr.DataArray(points.geometry.y, dims="intact"),
    method="nearest",
)
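
Because method="nearest" snaps each point to the closest pixel centre, it can be worth checking how far the selection drifted from the requested coordinates (an optional sketch, not part of the original workflow):

# Offsets between the requested point coordinates and the selected pixel centres
print(np.abs(sel_cube["x"].values - points.geometry.x.values))
print(np.abs(sel_cube["y"].values - points.geometry.y.values))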

16.8.2 Flatten (xarray dataset/“cube” to pandas dataframe/table)

sel_frame = (
    sel_cube.drop_vars("spatial_ref")
    .to_dataframe()
    .drop(columns=["x", "y"])
    .reset_index()
)

16.8.3 Prepare auxiliary variables for plotting

def normalized_difference(frame, positive, negative):
    numerator = frame[positive] - frame[negative]
    denominator = frame[positive] + frame[negative]
    return numerator / denominator


sel_frame["NDVI"] = normalized_difference(sel_frame, "NIRnarrow", "Red")
sel_frame["NDMI"] = normalized_difference(sel_frame, "NIRnarrow", "SWIR1")
sel_frame["DOY"] = sel_frame["time"].dt.dayofyear

16.8.4 Split the table into intact and deforested

# Use explicit copies so the flag assignments below do not modify a view
deforested_frame = sel_frame[~sel_frame["intact"]].copy()
intact_frame = sel_frame[sel_frame["intact"]].copy()

16.9 Clouds and shadows taint our time series

# Assign flags in order of priority; later assignments overwrite earlier ones
intact_frame.loc[:, "flag"] = "clear"
intact_frame.loc[intact_frame[tainted_cols[1:]].any(axis=1), "flag"] = "cloud/shadow"
intact_frame.loc[intact_frame[tainted_cols[0]], "flag"] = "adjacent"


intact_frame.hvplot.scatter(
    x="time", y="Green", by="flag", color=["green", "black", "orange"]
)
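
How many observations fall into each flag category can be checked with a quick value count (an optional sketch, not in the original notebook):

intact_frame["flag"].value_counts()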

16.10 Filter out flagged observations

intact_masked = intact_frame[~intact_frame[tainted_cols].any(axis=1)].copy()
deforested_masked = deforested_frame[~deforested_frame[tainted_cols].any(axis=1)].copy()

16.11 With the clouds removed, the timeline becomes clearer

tetracolor_kwargs = {
    "x": "time",
    "y": ["Blue", "Green", "Red", "NIRnarrow"],
    "color": ["blue", "green", "red", "darkgray"],
}

intact_masked.hvplot(**tetracolor_kwargs)

16.12 Spikes could be unmasked clouds/shadows

A large difference in value relative to neighboring observations indicates a possible outlier.

def despike(frame, columns, min_spike, max_spike):
    summed = frame[columns].sum(axis=1)

    # Compare each observation with its immediate neighbors
    central = summed.iloc[1:-1]
    prior = summed.shift(1).iloc[1:-1]
    posterior = summed.shift(-1).iloc[1:-1]

    # Remove observations that stand out too much relative to their neighbors
    spikiness = central - (prior + posterior) / 2
    floor, ceiling = spikiness.quantile((min_spike, max_spike))
    selected = central[spikiness.between(floor, ceiling)]

    return frame.loc[selected.index]


cutoff = 0.05
band_names = ["Blue", "Green", "Red", "NIRnarrow", "SWIR1", "SWIR2"]

intact_despiked = despike(intact_masked, band_names, cutoff, 1 - cutoff)
deforested_despiked = despike(deforested_masked, band_names, cutoff, 1 - cutoff)
(
    intact_masked.hvplot(x="time", y="NIRnarrow", color="k")
    * intact_despiked.hvplot(x="time", y="NIRnarrow", color="darkgray")
)

16.13 The spikes as anomalies

These spikes are anomalies in the context where they appear, but they may or may not be global outliers when compared against the full population.

spike_frame = intact_masked.iloc[1:-1].drop(index=intact_despiked.index)
# DOY = day of the year
(
    intact_masked.hvplot.scatter(x="Red", y="NIRnarrow", c="DOY", colormap="twilight")
    * spike_frame.hvplot.scatter(x="Red", y="NIRnarrow", marker="x", s=90, color="red")
)

The isolated points far above and below the general population would be global outliers, whereas the rest are outliers only in their specific context.
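
One way to make that distinction concrete (a sketch, assuming "global" means falling outside the central 98% of the masked series) is to compare the spikes against quantile bounds of the whole population:

# Global-outlier bounds from the full masked time series (assumed 1%-99% range)
low, high = intact_masked["NIRnarrow"].quantile([0.01, 0.99])
global_spikes = spike_frame[~spike_frame["NIRnarrow"].between(low, high)]
print(f"{len(global_spikes)} of {len(spike_frame)} spikes fall outside the 1%-99% range")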

16.14 The signature of deforestation

intact_despiked.hvplot(**tetracolor_kwargs)
deforested_despiked.hvplot(**tetracolor_kwargs)

16.15 With spectral indices

compare_frame = (
    pd.concat({"deforested": deforested_despiked, "intact": intact_despiked}, axis=0)
    .reset_index(names=["history", "index"])
    .drop(columns="index")
)
compare_frame.hvplot.line(
    x="time", y="NDVI", by="history", color=["black", "limegreen"]
)
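
The same comparison can be drawn for the moisture index computed earlier (a sketch reusing the NDMI column and the plotting pattern above):

compare_frame.hvplot.line(
    x="time", y="NDMI", by="history", color=["black", "limegreen"]
)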