PCA for 8-band satellite imagery with python

Create a standardized Principal Component Analysis (PCA) raster stack from 8-band multispectral satellite imagery. Python script at the end of the article. The full jupyter notebook is available here.

Background

Principle Component Analysis1 is a technique for reducing the volume or dimensionality of a complex dataset while preserving as much of the information as possible.

In this workflow we will see how a PCA can reduce an 8-band satellite image into a 3-band image while preserving over 95% of the information.

Data

For this exercise we will be evaluating a single scene from Planet Labs SuperDove2 constellation. The earliest imagery available is from around mid-March, 2020. The constellation is acquiring new imagery of the entire planet almost daily.

Each SuperDove scene covers approximately 32.5 x 19.6 km (637 km2), and has a spatial resolution of about 3 meters. Each scene file is about 980Mb on disk. A small example dataset is available in the jupyter notebook repo or at the following link3.

Load Data

First, our libraries and data...

from rasterio.plot import show
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import dask.array as da
import rasterio

filename = './data/analytic/demo.tif'
src = rasterio.open(filename)

n_bands = src.meta['count']
img_shape = src.shape

band_numbers = ['1', '2', '3', '4', '5', '6', '7', '8']
pc_names = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8']

band_names = ['coastal_blue',
              'blue',
              'green_i',
              'green',
              'yellow',
              'red',
              'red_edge',
              'nir']

Summary Statistics

Now, we will look at the raw data and it's distributions.

stats_raw = []

for idx, band in enumerate(src.read()):

    stats_raw.append({
        'band': idx + 1,
        'min': band.min(),
        'mean': band.mean(),
        'max': band.max(),
        'std': int(np.std(band)),
        'var': int(np.var(band)),

    })

summary_stats = pd.DataFrame.from_records(
    stats_raw, index='band').astype(int)

summary_stats
Table 1. Summary Statistics for Surface Reflectance Values.
minmeanmaxstdvar
band
111042024492456
21782183512686
382302346725192
4373112686836925
5172102582826804
611182670847156
7120767366414220305
846037635970385148705

Here we can see band 8, the NIR band, tends to have values much larger than the other bands. We will use a min-max normalization and the correlation matrix to produce a standardized PCA.

Look at the Data

vals = []
for idx, band in enumerate(src.read()):
    vals.append({
        'band': idx + 1,
        'vmin': int(band.mean() - np.std(band)),
        'vmax': int(band.mean() + np.std(band)),
    })

fig, axes = plt.subplots(
    nrows=3,
    ncols=3,
    figsize=(10, 10),
    sharey=True)

ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9 = axes.flatten()

fig.delaxes(ax9)

# plots
show((src, 1), cmap='Greys_r', ax=ax1,
     vmin=vals[0]['vmin'], vmax=vals[0]['vmax'])
show((src, 2), cmap='Greys_r', ax=ax2,
     vmin=vals[1]['vmin'], vmax=vals[1]['vmax'])
show((src, 3), cmap='Greys_r', ax=ax3,
     vmin=vals[2]['vmin'], vmax=vals[2]['vmax'])
show((src, 4), cmap='Greys_r', ax=ax4,
     vmin=vals[3]['vmin'], vmax=vals[3]['vmax'])
show((src, 5), cmap='Greys_r', ax=ax5,
     vmin=vals[4]['vmin'], vmax=vals[4]['vmax'])
show((src, 6), cmap='Greys_r', ax=ax6,
     vmin=vals[5]['vmin'], vmax=vals[5]['vmax'])
show((src, 7), cmap='Greys_r', ax=ax7,
     vmin=vals[6]['vmin'], vmax=vals[6]['vmax'])
show((src, 8), cmap='Greys_r', ax=ax8,
     vmin=vals[7]['vmin'], vmax=vals[7]['vmax'])

# add titles
ax1.set_title('coastal_blue')
ax2.set_title('blue')
ax3.set_title('green_i')
ax4.set_title('green')
ax5.set_title('yellow')
ax6.set_title('red')
ax7.set_title('red_edge')
ax8.set_title('nir')

Fig 2: each band presented in greyscale with color mapping around mean +/- one standard deviation.

bnd_list = []
for i in range(n_bands):
    bnd = da.from_array(src.read(i+1))
    bnd_list.append(bnd)

dask_band_stack = da.dstack(bnd_list)
dask_band_stack = dask_band_stack.rechunk('auto')
src_sr = dask_band_stack.compute()

A good way to visualize the distribution of pixel values across the 8 bands is to use a kernel-density-estimate plot from seaborn. For the smaller example image this takes about a minute.

for i in range(n_bands):
    sns.kdeplot(src_sr[:, :, i].flatten())

plt.title('Density Plot with no normalization')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(band_names)

Fig 3: Kernel Density Estimate plot of the raw satellite data.

Normalize the data

Because the values in the NIR band (band 8) are so much larger than the other bands it can skew the correlation and eigen values, so we will perform a min-max normalization on the data first. Also, calculating correlation coefficients requires us to flatten the data from a 3-dimensional matrix down to a two-dimensional matrix. the band layers are preserved, but the rows and columns of the image are cast into one long vector. We will reshape the data before we are done.

flat_bnds = np.zeros((src_sr[:, :, 0].size, n_bands))

for i in range(n_bands):
    bnd_i = src_sr[:, :, i].flatten()
    bnd_norm = 1024*((bnd_i - bnd_i.min())/(bnd_i.max() - bnd_i.min()))
    flat_bnds[:, i] = bnd_norm
stats_norm = []
for i in range(n_bands):
    band = flat_bnds[:, i]

    stats_norm.append({
        'band': i + 1,
        'min': band.min(),
        'mean': band.mean(),
        'max': band.max(),
        'std': int(np.std(band)),
        'var': int(np.var(band)),
    })

smry_norm = pd.DataFrame.from_records(
    stats_norm, index='band').astype(int)
Table 2. Summary Statistics for Normalized Values.
minmeanmaxstdvar
band
1052102425629
2036102424591
3097102431996
401051024321034
50771024321084
60441024321053
701871024411695
806131024715135
# look at data
for i in range(n_bands):
    sns.kdeplot(flat_bnds[:, i])

plt.title('Density Plot with Min-Max Normalization')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(band_names)

Fig X: subtitle

Correlation Matrix

To calculate the Eigen Vectors and Eigen Values we need to compute the correlation matrix.

Using the correlation coefficients instead of the covariance matrix creates a standardized PCA. Eastman and Fulk, 1993, Carr, 1998

(citation) suggests the standardized PCA is better for change detection.

cor_lists = np.corrcoef(flat_bnds.transpose())

cor_data = pd.DataFrame(
    cor_lists,
    columns=band_numbers,
    index=band_numbers,
)

cor_data = round(cor_data, 2)
Table 3: Correlation Coefficents
1234
11.00
20.801.00
30.730.881.00
40.720.870.971.00
50.770.920.940.94
60.790.940.890.88
70.630.770.920.94
8-0.020.000.230.24
5678
51.00
60.951.00
70.880.791.00
80.07-0.050.361.00

Eigen Values and Vectors

EigVal_cor, EigVec_cor = np.linalg.eig(cor_lists)

std_eigVec_table = pd.DataFrame(
    EigVec_cor,
    columns=band_numbers,
    index=band_numbers
)

std_eigVec_table = round(std_eigVec_table, 3)
std_eigVec_table
Table 5: Eigenvectors
1234
10.331-0.212-0.8540.341
20.379-0.165-0.073-0.630
30.3920.0980.1540.104
40.3910.1130.1920.182
50.394-0.0620.167-0.038
60.382-0.2010.071-0.345
70.3680.2610.2750.487
80.0620.891-0.311-0.293
5678
1-0.0080.0010.020-0.001
2-0.1290.042-0.460-0.444
30.5380.355-0.4310.448
4-0.330-0.726-0.2350.272
5-0.6100.5080.3310.269
60.448-0.2800.6380.061
70.1070.0940.125-0.669
8-0.0160.0060.1270.062
# Ordering Eigen values and vectors
# and projecting data on Eigen vector
# directions results in Principal Components

order2 = EigVal_cor.argsort()[::-1]

EigVal_cor = EigVal_cor[order2]
pc_values_cor = (EigVal_cor/sum(EigVal_cor)*100).round(2)
value = sum(pc_values_cor[0:3])
print(f'variance accounted for by first 3 components of the std: {value}%')

variance accounted for by first 3 components of the std: 95.95%

What Does It Mean?

We can determine correlation between the original bands and the newly calculate principle components by calculating a table of factor loadings4

In the table below, we see that PC8 has the highest factor loadings with the NIR and Red Edge bands, so PC1 has retained most of the information in those bands.

PC8, however, has very low factor loading values and contains mostly noise.

loading_cor = pd.DataFrame(
    EigVec_cor * np.sqrt(EigVal_cor) / np.sqrt(np.abs(cor_lists)),
    columns=pc_names,
    index=band_names,
)

loading_cor = round(loading_cor, 3)
loading_cor
Table 6: Principle Component Loadings
PC1PC2PC3PC4
coastal_blue0.820-0.257-0.5940.154
blue1.050-0.178-0.046-0.259
green_i1.1370.1130.0910.040
green1.1450.1310.1160.070
yellow1.117-0.0700.102-0.015
red1.067-0.2240.045-0.141
red_edge1.1540.3220.1700.193
nir1.25516.126-0.387-0.230
PC5PC6PC7PC8
coastal_blue-0.0020.0000.005-0.002
blue-0.0330.010-0.096-1.276
green_i0.1370.087-0.0820.161
green-0.084-0.179-0.0440.096
yellow-0.1510.1210.0650.170
red0.114-0.0650.1310.047
red_edge0.0280.0250.023-0.191
nir-0.0140.0060.0390.011
fig, ax = plt.subplots()

pc_bars = ax.bar(
    [1, 2, 3, 4, 5, 6, 7, 8],
    (EigVal_cor/sum(EigVal_cor)*100).round(2),
    align='center',
    width=0.4,
    tick_label=pc_names,
)

plt.bar_label(pc_bars, labels=[f'{x/100:.2%}' for x in pc_bars.datavalues])
plt.ylabel('Variance (%)')
plt.title('Information retention - standardized (correlation)')

Fig X: subtitle

Create the PCA components for the Imagery

PC_std = np.matmul(flat_bnds, EigVec_cor)  # cross product

print(PC_std.shape)

(5385876, 8)

stats_std_PC = []
for i in range(n_bands):
    band = PC_std[:, i]

    stats_std_PC.append({
        'band': i + 1,
        'min': band.min(),
        'mean': band.mean(),
        'max': band.max(),
        'std': int(np.std(band)),
        'var': int(np.var(band)),
    })

smry_std_PC = pd.DataFrame.from_records(
    stats_std_PC, index='band').astype(int)
minmeanmaxstdvar
band
1252652541786206
2-45861001694890
3-548-13513623552
4-270-828017311
5-93-472636
6-48779531
7-927326910116
8-74-7168876

Reshape to image stack

PC_std_2d = np.zeros((img_shape[0], img_shape[1], n_bands))
for i in range(n_bands):
    PC_std_2d[:, :, i] = PC_std[:, i].reshape(-1, img_shape[1])

for i in range(3):
    sns.kdeplot(PC_std_2d[:, :, i].flatten())

plt.title('Density Plot for PC bands (standardized)')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(pc_names[0:3])

Fig X: subtitle

PC_nrm = np.zeros(PC_std.shape)

for i in range(n_bands):
    PC_i = PC_std[:, i]
    pc_norm_mm = 1024*(PC_i - PC_i.min())/(PC_i.max() - PC_i.min())
    PC_nrm[:, i] = pc_norm_mm

PC_std_2d_norm = np.zeros((img_shape[0], img_shape[1], n_bands))
for i in range(n_bands):
    PC_std_2d_norm[:, :, i] = PC_nrm[:, i].reshape(-1, img_shape[1])
for i in range(3):
    sns.kdeplot(PC_std_2d_norm[:, :, i].flatten())

plt.title('Density Plot for First Three Principle Components')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(pc_names[0:3])

Fig X: Density plot for the first three principle components

# Set spatial characteristics of the output object to mirror the input
kwargs = src.meta
kwargs.update(
    dtype=rasterio.float32,
    count=8)
# band, row, col
tmp_std_norm = np.moveaxis(PC_std_2d_norm, [0, 1, 2], [2, 1, 0])
PC_img_std_norm = np.swapaxes(tmp_std_norm, 1, 2)
with rasterio.open('./data/analytic/0_derived/PC_img_std_norm.tif', 'w', **kwargs) as dst:
    dst.write(PC_img_std_norm.astype(rasterio.float32))

References and Links

1 Wikipedia: Principle Component Analysis

2. Planet Labs SuperDove Constellation

3. Download Sample Planet Labs Data

4. Factor Loadings

import rasterio
import dask.array as da
import numpy as np

filename = 'path/to/tif.tif'
src = rasterio.open(filename)

n_bands = src.meta['count']
img_shape = src.shape

bnd_list = []
for i in range(n_bands):
    bnd = da.from_array(src.read(i+1))
    bnd_list.append(bnd)

dask_band_stack = da.dstack(bnd_list)
dask_band_stack = dask_band_stack.rechunk('auto')
src_sr = dask_band_stack.compute()

flat_bnds = np.zeros((src_sr[:, :, 0].size, n_bands))

for i in range(n_bands):
    bnd_i = src_sr[:, :, i].flatten()
    bnd_norm = 1024*((bnd_i - bnd_i.min())/(bnd_i.max() - bnd_i.min()))
    flat_bnds[:, i] = bnd_norm

cor_lists = np.corrcoef(flat_bnds.transpose())

EigVal_cor, EigVec_cor = np.linalg.eig(cor_lists)
order2 = EigVal_cor.argsort()[::-1]
EigVal_cor = EigVal_cor[order2]

pc_values_cor = (EigVal_cor/sum(EigVal_cor)*100).round(2)

PC_std = np.matmul(flat_bnds, EigVec_cor)

PC_nrm = np.zeros(PC_std.shape)
for i in range(n_bands):
    PC_i = PC_std[:, i]
    pc_norm_mm = 1024*(PC_i - PC_i.min())/(PC_i.max() - PC_i.min())
    PC_nrm[:, i] = pc_norm_mm

PC_std_2d_norm = np.zeros((img_shape[0], img_shape[1], n_bands))
for i in range(n_bands):
    PC_std_2d_norm[:, :, i] = PC_nrm[:, i].reshape(-1, img_shape[1])

kwargs = src.meta
kwargs.update(
    dtype=rasterio.float32,
    count=8)

tmp_std_norm = np.moveaxis(PC_std_2d_norm, [0, 1, 2], [2, 1, 0])
PC_img_std_norm = np.swapaxes(tmp_std_norm, 1, 2)

with rasterio.open('./output.tif', 'w', **kwargs) as dst:
    dst.write(PC_img_std_norm.astype(rasterio.float32))