from enum import Enum
from pathlib import Path
from typing import Any
import geopandas as gpd
import hvplot.pandas
import hvplot.xarray # noqa
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rioxarray # noqa
import xarray as xr
from rasterio import Affine
from rasterio.features import rasterize
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from envrs.download_path import make_url
This notebook presents a workflow for classifying forest cover in Mozambique using remote sensing data. The classification uses annual mosaics derived from Sentinel-2 and Landsat (HLS) sources, with each pixel summarized by value quantiles of the yearly observations.
Image source: National Geographic, https://www.nationalgeographic.com/magazine/article/mozambique-gorongosa-national-park-wildlife-rebound
17.1 Workflow Setup
We begin by importing the necessary libraries for geospatial analysis, data handling, visualization, and machine learning. These tools form the foundation for the classification workflow that follows.
year: int = 2018  # change to 2018, 2020 or 2024

url_dc = make_url(
    f"HLS_T36KXE_{year}_b30_v2.zarr.zip",
    is_zip=True,
)
url_slivers = make_url(
    "label_features_i02.geojson",
)
https://git.geo.tuwien.ac.at/api/v4/projects/1266/repository/files/HLS_T36KXE_2018_b30_v2.zarr.zip/raw?ref=main&lfs=true
https://git.geo.tuwien.ac.at/api/v4/projects/1266/repository/files/label_features_i02.geojson/raw?ref=main&lfs=true
The next step is to load the remote sensing image data and the GeoJSON file containing the regions of interest (ROIs) for classification. These ROIs are polygons carrying land cover labels, from which the forest/non-forest target is later derived. The polygon labels were created in QGIS and provide the ground truth required for supervised classification.
# Load Data
feature_cube = xr.open_dataset(url_dc, engine="zarr", consolidated=False)
feature_cube = feature_cube.sel(quantile=slice(0.1, 0.9))  # remove outliers
slivers: gpd.GeoDataFrame = gpd.read_file(url_slivers)
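Before continuing, it can help to take a quick look at what was loaded; this is an optional check, not part of the original workflow:

# Optional: inspect the loaded data cube and the labeled polygons
print(feature_cube.data_vars)  # available bands/features
print(feature_cube.dims)       # e.g. quantile, y, x
print(slivers.head())          # polygon labels and geometries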
# Prepare the crs and affine transformation
spatial_ref: dict[str, Any] = feature_cube["spatial_ref"].attrs
crs: str = spatial_ref["crs_wkt"]
_geotransform: list[float] = [float(x) for x in spatial_ref["GeoTransform"].split()]
affine = Affine.from_gdal(*_geotransform)

# Set the crs on the polygons
slivers.to_crs(crs, inplace=True)
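The GeoTransform attribute stores the six GDAL coefficients in the order (top-left x, pixel width, row rotation, top-left y, column rotation, pixel height). A minimal sketch with invented coefficients (a 30 m grid with an arbitrary corner, not values from this dataset) shows how Affine.from_gdal reorders them and how the resulting affine maps pixel indices to map coordinates:

# Sketch only: made-up coefficients to illustrate the mapping
demo_affine = Affine.from_gdal(600000.0, 30.0, 0.0, 7900000.0, 0.0, -30.0)
print(demo_affine * (0, 0))  # top-left corner -> (600000.0, 7900000.0)
print(demo_affine * (1, 1))  # one pixel in    -> (600030.0, 7899970.0)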
Defining land cover types as an enumeration provides a clear mapping between each class and its numeric value. This improves code readability and ensures consistency when assigning and interpreting classification labels throughout the workflow.
class LandCoverType(Enum):
    """Binding land cover types to their numeric values."""

    TREE_COVER = 10
    SHRUBLAND = 20
    GRASSLAND = 30
    CROPLAND = 40
    SETTLEMENT = 50
    BARE_SPARSE = 60
    SNOW_ICE_ABSENT = 70
    PERMANENT_WATER = 80
    INTERMITTENT_WATER = 86
    WETLAND = 90
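The enum can be queried in both directions, which is handy when interpreting raster values:

# Look up a class by its numeric code, or a code by its class
print(LandCoverType(10))               # LandCoverType.TREE_COVER
print(LandCoverType.TREE_COVER.value)  # 10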
We will now plot the image data alongside the regions of interest (ROIs) to get an overview of the spatial distribution of the labeled areas. This visualization helps verify that the labeled polygons align with the underlying remote sensing data.
# Plot RGB Image
fig, ax = plt.subplots()
feature_cube[["Red", "Green", "Blue"]].sel(quantile=0.5).to_dataarray().plot.imshow(
    robust=True,
    ax=ax,
)
slivers.plot(ax=ax, column="label", cmap="RdYlBu")
plt.show()
17.2 Prepare Data for Classification
Prior to the actual machine learning classification, the data needs to be preprocessed and organized into a suitable format. The preparation involves several key steps:

1. Rasterization of the polygons to create a mask for the areas of interest.
2. Extraction of pixel values from the image data using the mask.
3. Creation of a feature matrix and target vector for classification from the image data.
# https://rasterio.readthedocs.io/en/stable/api/rasterio.features.html#rasterio.features.rasterize
# Rasterize the polygons
rst: np.ndarray = rasterize(
    shapes=slivers.to_crs(crs)[["geometry", "poly_id"]]
    .to_records(index=False)
    .tolist(),
    out_shape=feature_cube["Red"].shape[1:],  # only x and y dimensions
    transform=affine,
)
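As an optional sanity check (our addition, not part of the original workflow), we can count how many pixels were burned per polygon id; 0 is the default rasterize fill value for unlabeled pixels:

# Optional: pixel counts per polygon id (0 = background fill)
ids, counts = np.unique(rst, return_counts=True)
print(dict(zip(ids, counts)))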
# Create a DataArray from the rasterized polygons
label_cube = xr.DataArray(
    rst,
    coords={"y": feature_cube.y, "x": feature_cube.x},
    name="poly_id",
)
# Create a DataFrame from the image data
feature_frame: pd.DataFrame = (
    feature_cube.drop_vars("spatial_ref")
    .to_dataframe()
    .dropna()
    .pivot_table(index=["y", "x"], columns="quantile", values=None)
    # .unstack(level="quantile")
)
# Rename the columns to include quantile information
def _rename_col(col: tuple[str, float]) -> str:
    return f"{col[0]}_P{np.round(100 * col[1], 0).astype(int):03}"


feature_frame.columns = [_rename_col(col) for col in feature_frame.columns]
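For example, the Red band at the 0.5 quantile becomes "Red_P050" (the "NIR" tuple below is purely illustrative):

# Worked examples of the renaming scheme
print(_rename_col(("Red", 0.5)))  # Red_P050
print(_rename_col(("NIR", 0.9)))  # NIR_P090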
# Create a DataFrame from the label data (originally the polygons)
label_frame: pd.DataFrame = (
    label_cube.to_dataframe()
    .reset_index()
    .merge(slivers[["poly_id", "label"]])
    .set_index(["y", "x"])
)

SEED: int = 42

train_frame: pd.DataFrame = (
    label_frame.join(feature_frame)
    .groupby("poly_id")
    .sample(frac=0.7, random_state=SEED)
)
# Prepare the features and target variable for classification
X_train: pd.DataFrame = train_frame.drop(columns=["label", "poly_id"])
y_train: pd.Series = train_frame["label"]

_drop_idx: pd.Index = train_frame.index
y_test: pd.Series = label_frame.join(feature_frame).drop(index=_drop_idx)["label"]
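Only y_test is defined above; for later evaluation, a matching feature matrix can be built the same way. The name X_test is our own, a sketch following the same conventions:

# Hypothetical X_test, mirroring how y_test was derived above
X_test: pd.DataFrame = (
    label_frame.join(feature_frame)
    .drop(index=_drop_idx)
    .drop(columns=["label", "poly_id"])
)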
17.3 Classify Forest Cover with Scikit-learn
We will proceed with training a Random Forest classifier using the prepared data. This process involves the following key steps:

1. Splitting the data into training and testing sets to evaluate the model's performance.
2. Training the model on the training set.
3. Evaluating the model on the testing set.
4. Visualizing the results.
# Classify Forest Cover with Scikit-learn
clf_rf = RandomForestClassifier(
    random_state=SEED,
    class_weight="balanced",
    n_jobs=-1,
)

# The labels are numeric codes, so the target is binarized against the
# enum's numeric value (tree cover vs. everything else)
clf_rf.fit(X=X_train, y=(y_train == LandCoverType.TREE_COVER.value))
RandomForestClassifier(class_weight='balanced', n_jobs=-1, random_state=42)
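With the hypothetical X_test built earlier, the classification_report imported at the top can summarize accuracy on the held-out pixels. This is a minimal evaluation sketch, binarizing the target exactly as during training:

# Minimal evaluation sketch (assumes X_test from the earlier sketch)
y_pred = clf_rf.predict(X_test)
print(classification_report(y_test == LandCoverType.TREE_COVER.value, y_pred))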
With the trained model, we can now make predictions on the image data. The Random Forest classifier predicts forest cover for each pixel in the image. For this, the image data must be arranged in the same way as the training data, since the model expects one feature vector per sample.
# Predict using the trained model
class_frame = pd.DataFrame(
    clf_rf.predict(feature_frame),
    index=feature_frame.index,
    columns=["RF"],
)
# Recreate the DataArray for the predicted forest cover
predicted_forest = (
    class_frame.to_xarray().rio.write_crs(crs).rio.write_transform(affine)
)
# Map the predicted forest classification
predicted_forest.where(predicted_forest == 1).hvplot.image(
    x="x",
    y="y",
    tiles=True,
    cmap="greens",
    crs=crs,
    hover=False,
)
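As a final optional check (our addition, not part of the original workflow), the flat prediction frame gives the share of classified pixels predicted as forest; since the predictions are booleans, their mean is that fraction:

# Boolean predictions: the mean is the predicted forest fraction
forest_share = float(class_frame["RF"].mean())
print(f"Predicted forest fraction: {forest_share:.1%}")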