PCA for 8-band satellite imagery with python

Create a standardized Principal Component Analysis (PCA) raster stack from 8-band multispectral satellite imagery. Python script at the end of the article. The full jupyter notebook is available here.

Background

Principle Component Analysis¹ is a technique for reducing the volume or dimensionality of a complex dataset while preserving as much of the information as possible.

In this workflow we will see how a PCA can reduce an 8-band satellite image into a 3-band image while preserving over 95% of the information.

Data

For this exercise we will be evaluating a single scene from Planet Labs SuperDove² constellation. The earliest imagery available is from around mid-March, 2020. The constellation is acquiring new imagery of the entire planet almost daily.

Each SuperDove scene covers approximately 32.5 x 19.6 km (637 km²), and has a spatial resolution of about 3 meters. Each scene file is about 980Mb on disk. A small example dataset is available in the jupyter notebook repo or at the following link³.

Load Data

First, our libraries and data...

from rasterio.plot import show
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import dask.array as da
import rasterio

filename = './data/analytic/demo.tif'
src = rasterio.open(filename)

n_bands = src.meta['count']
img_shape = src.shape

band_numbers = ['1', '2', '3', '4', '5', '6', '7', '8']
pc_names = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8']

band_names = ['coastal_blue',
              'blue',
              'green_i',
              'green',
              'yellow',
              'red',
              'red_edge',
              'nir']

Summary Statistics

Now, we will look at the raw data and it's distributions.

stats_raw = []

for idx, band in enumerate(src.read()):

    stats_raw.append({
        'band': idx + 1,
        'min': band.min(),
        'mean': band.mean(),
        'max': band.max(),
        'std': int(np.std(band)),
        'var': int(np.var(band)),

    })

summary_stats = pd.DataFrame.from_records(
    stats_raw, index='band').astype(int)

summary_stats

Table 1. Summary Statistics for Surface Reflectance Values.

	min	mean	max	std	var
band
1	1	104	2024	49	2456
2	1	78	2183	51	2686
3	8	230	2346	72	5192
4	37	311	2686	83	6925
5	17	210	2582	82	6804
6	1	118	2670	84	7156
7	120	767	3664	142	20305
8	460	3763	5970	385	148705

Here we can see band 8, the NIR band, tends to have values much larger than the other bands. We will use a min-max normalization and the correlation matrix to produce a standardized PCA.

Look at the Data

vals = []
for idx, band in enumerate(src.read()):
    vals.append({
        'band': idx + 1,
        'vmin': int(band.mean() - np.std(band)),
        'vmax': int(band.mean() + np.std(band)),
    })

fig, axes = plt.subplots(
    nrows=3,
    ncols=3,
    figsize=(10, 10),
    sharey=True)

ax1, ax2, ax3, ax4, ax5, ax6, ax7, ax8, ax9 = axes.flatten()

fig.delaxes(ax9)

# plots
show((src, 1), cmap='Greys_r', ax=ax1,
     vmin=vals[0]['vmin'], vmax=vals[0]['vmax'])
show((src, 2), cmap='Greys_r', ax=ax2,
     vmin=vals[1]['vmin'], vmax=vals[1]['vmax'])
show((src, 3), cmap='Greys_r', ax=ax3,
     vmin=vals[2]['vmin'], vmax=vals[2]['vmax'])
show((src, 4), cmap='Greys_r', ax=ax4,
     vmin=vals[3]['vmin'], vmax=vals[3]['vmax'])
show((src, 5), cmap='Greys_r', ax=ax5,
     vmin=vals[4]['vmin'], vmax=vals[4]['vmax'])
show((src, 6), cmap='Greys_r', ax=ax6,
     vmin=vals[5]['vmin'], vmax=vals[5]['vmax'])
show((src, 7), cmap='Greys_r', ax=ax7,
     vmin=vals[6]['vmin'], vmax=vals[6]['vmax'])
show((src, 8), cmap='Greys_r', ax=ax8,
     vmin=vals[7]['vmin'], vmax=vals[7]['vmax'])

# add titles
ax1.set_title('coastal_blue')
ax2.set_title('blue')
ax3.set_title('green_i')
ax4.set_title('green')
ax5.set_title('yellow')
ax6.set_title('red')
ax7.set_title('red_edge')
ax8.set_title('nir')

Fig 2: each band presented in greyscale with color mapping around mean +/- one standard deviation.

bnd_list = []
for i in range(n_bands):
    bnd = da.from_array(src.read(i+1))
    bnd_list.append(bnd)

dask_band_stack = da.dstack(bnd_list)
dask_band_stack = dask_band_stack.rechunk('auto')
src_sr = dask_band_stack.compute()

A good way to visualize the distribution of pixel values across the 8 bands is to use a kernel-density-estimate plot from seaborn. For the smaller example image this takes about a minute.

for i in range(n_bands):
    sns.kdeplot(src_sr[:, :, i].flatten())

plt.title('Density Plot with no normalization')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(band_names)

Fig 3: Kernel Density Estimate plot of the raw satellite data.

Normalize the data

Because the values in the NIR band (band 8) are so much larger than the other bands it can skew the correlation and eigen values, so we will perform a min-max normalization on the data first. Also, calculating correlation coefficients requires us to flatten the data from a 3-dimensional matrix down to a two-dimensional matrix. the band layers are preserved, but the rows and columns of the image are cast into one long vector. We will reshape the data before we are done.

flat_bnds = np.zeros((src_sr[:, :, 0].size, n_bands))

for i in range(n_bands):
    bnd_i = src_sr[:, :, i].flatten()
    bnd_norm = 1024*((bnd_i - bnd_i.min())/(bnd_i.max() - bnd_i.min()))
    flat_bnds[:, i] = bnd_norm

stats_norm = []
for i in range(n_bands):
    band = flat_bnds[:, i]

    stats_norm.append({
        'band': i + 1,
        'min': band.min(),
        'mean': band.mean(),
        'max': band.max(),
        'std': int(np.std(band)),
        'var': int(np.var(band)),
    })

smry_norm = pd.DataFrame.from_records(
    stats_norm, index='band').astype(int)

Table 2. Summary Statistics for Normalized Values.

	min	mean	max	std	var
band
1	0	52	1024	25	629
2	0	36	1024	24	591
3	0	97	1024	31	996
4	0	105	1024	32	1034
5	0	77	1024	32	1084
6	0	44	1024	32	1053
7	0	187	1024	41	1695
8	0	613	1024	71	5135

# look at data
for i in range(n_bands):
    sns.kdeplot(flat_bnds[:, i])

plt.title('Density Plot with Min-Max Normalization')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(band_names)

Fig X: subtitle

Correlation Matrix

To calculate the Eigen Vectors and Eigen Values we need to compute the correlation matrix.

Using the correlation coefficients instead of the covariance matrix creates a standardized PCA. Eastman and Fulk, 1993, Carr, 1998

(citation) suggests the standardized PCA is better for change detection.

cor_lists = np.corrcoef(flat_bnds.transpose())

cor_data = pd.DataFrame(
    cor_lists,
    columns=band_numbers,
    index=band_numbers,
)

cor_data = round(cor_data, 2)

Table 3: Correlation Coefficents

	1	2	3	4
1	1.00
2	0.80	1.00
3	0.73	0.88	1.00
4	0.72	0.87	0.97	1.00
5	0.77	0.92	0.94	0.94
6	0.79	0.94	0.89	0.88
7	0.63	0.77	0.92	0.94
8	-0.02	0.00	0.23	0.24

	5	6	7	8
5	1.00
6	0.95	1.00
7	0.88	0.79	1.00
8	0.07	-0.05	0.36	1.00

Eigen Values and Vectors

EigVal_cor, EigVec_cor = np.linalg.eig(cor_lists)

std_eigVec_table = pd.DataFrame(
    EigVec_cor,
    columns=band_numbers,
    index=band_numbers
)

std_eigVec_table = round(std_eigVec_table, 3)
std_eigVec_table

Table 5: Eigenvectors

	1	2	3	4
1	0.331	-0.212	-0.854	0.341
2	0.379	-0.165	-0.073	-0.630
3	0.392	0.098	0.154	0.104
4	0.391	0.113	0.192	0.182
5	0.394	-0.062	0.167	-0.038
6	0.382	-0.201	0.071	-0.345
7	0.368	0.261	0.275	0.487
8	0.062	0.891	-0.311	-0.293

	5	6	7	8
1	-0.008	0.001	0.020	-0.001
2	-0.129	0.042	-0.460	-0.444
3	0.538	0.355	-0.431	0.448
4	-0.330	-0.726	-0.235	0.272
5	-0.610	0.508	0.331	0.269
6	0.448	-0.280	0.638	0.061
7	0.107	0.094	0.125	-0.669
8	-0.016	0.006	0.127	0.062

# Ordering Eigen values and vectors
# and projecting data on Eigen vector
# directions results in Principal Components

order2 = EigVal_cor.argsort()[::-1]

EigVal_cor = EigVal_cor[order2]

pc_values_cor = (EigVal_cor/sum(EigVal_cor)*100).round(2)

value = sum(pc_values_cor[0:3])
print(f'variance accounted for by first 3 components of the std: {value}%')

variance accounted for by first 3 components of the std: 95.95%

What Does It Mean?

We can determine correlation between the original bands and the newly calculate principle components by calculating a table of factor loadings⁴

In the table below, we see that PC8 has the highest factor loadings with the NIR and Red Edge bands, so PC1 has retained most of the information in those bands.

PC8, however, has very low factor loading values and contains mostly noise.

loading_cor = pd.DataFrame(
    EigVec_cor * np.sqrt(EigVal_cor) / np.sqrt(np.abs(cor_lists)),
    columns=pc_names,
    index=band_names,
)

loading_cor = round(loading_cor, 3)
loading_cor

Table 6: Principle Component Loadings

	PC1	PC2	PC3	PC4
coastal_blue	0.820	-0.257	-0.594	0.154
blue	1.050	-0.178	-0.046	-0.259
green_i	1.137	0.113	0.091	0.040
green	1.145	0.131	0.116	0.070
yellow	1.117	-0.070	0.102	-0.015
red	1.067	-0.224	0.045	-0.141
red_edge	1.154	0.322	0.170	0.193
nir	1.255	16.126	-0.387	-0.230

	PC5	PC6	PC7	PC8
coastal_blue	-0.002	0.000	0.005	-0.002
blue	-0.033	0.010	-0.096	-1.276
green_i	0.137	0.087	-0.082	0.161
green	-0.084	-0.179	-0.044	0.096
yellow	-0.151	0.121	0.065	0.170
red	0.114	-0.065	0.131	0.047
red_edge	0.028	0.025	0.023	-0.191
nir	-0.014	0.006	0.039	0.011

fig, ax = plt.subplots()

pc_bars = ax.bar(
    [1, 2, 3, 4, 5, 6, 7, 8],
    (EigVal_cor/sum(EigVal_cor)*100).round(2),
    align='center',
    width=0.4,
    tick_label=pc_names,
)

plt.bar_label(pc_bars, labels=[f'{x/100:.2%}' for x in pc_bars.datavalues])
plt.ylabel('Variance (%)')
plt.title('Information retention - standardized (correlation)')

Fig X: subtitle

Create the PCA components for the Imagery

PC_std = np.matmul(flat_bnds, EigVec_cor)  # cross product

print(PC_std.shape)

(5385876, 8)

stats_std_PC = []
for i in range(n_bands):
    band = PC_std[:, i]

    stats_std_PC.append({
        'band': i + 1,
        'min': band.min(),
        'mean': band.mean(),
        'max': band.max(),
        'std': int(np.std(band)),
        'var': int(np.var(band)),
    })

smry_std_PC = pd.DataFrame.from_records(
    stats_std_PC, index='band').astype(int)

	min	mean	max	std	var
band
1	25	265	2541	78	6206
2	-4	586	1001	69	4890
3	-548	-135	136	23	552
4	-270	-82	80	17	311
5	-93	-4	72	6	36
6	-48	7	79	5	31
7	-92	73	269	10	116
8	-74	-7	168	8	76

Reshape to image stack

PC_std_2d = np.zeros((img_shape[0], img_shape[1], n_bands))
for i in range(n_bands):
    PC_std_2d[:, :, i] = PC_std[:, i].reshape(-1, img_shape[1])

for i in range(3):
    sns.kdeplot(PC_std_2d[:, :, i].flatten())

plt.title('Density Plot for PC bands (standardized)')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(pc_names[0:3])

Fig X: subtitle

PC_nrm = np.zeros(PC_std.shape)

for i in range(n_bands):
    PC_i = PC_std[:, i]
    pc_norm_mm = 1024*(PC_i - PC_i.min())/(PC_i.max() - PC_i.min())
    PC_nrm[:, i] = pc_norm_mm

PC_std_2d_norm = np.zeros((img_shape[0], img_shape[1], n_bands))
for i in range(n_bands):
    PC_std_2d_norm[:, :, i] = PC_nrm[:, i].reshape(-1, img_shape[1])

for i in range(3):
    sns.kdeplot(PC_std_2d_norm[:, :, i].flatten())

plt.title('Density Plot for First Three Principle Components')
plt.xlabel('DN')
plt.ylabel('Density')
plt.legend(pc_names[0:3])

Fig X: Density plot for the first three principle components

# Set spatial characteristics of the output object to mirror the input
kwargs = src.meta
kwargs.update(
    dtype=rasterio.float32,
    count=8)

# band, row, col
tmp_std_norm = np.moveaxis(PC_std_2d_norm, [0, 1, 2], [2, 1, 0])
PC_img_std_norm = np.swapaxes(tmp_std_norm, 1, 2)

with rasterio.open('./data/analytic/0_derived/PC_img_std_norm.tif', 'w', **kwargs) as dst:
    dst.write(PC_img_std_norm.astype(rasterio.float32))

References and Links

1 Wikipedia: Principle Component Analysis ↵

2. Planet Labs SuperDove Constellation ↵

3. Download Sample Planet Labs Data ↵

4. Factor Loadings ↵

import rasterio
import dask.array as da
import numpy as np

filename = 'path/to/tif.tif'
src = rasterio.open(filename)

n_bands = src.meta['count']
img_shape = src.shape

bnd_list = []
for i in range(n_bands):
    bnd = da.from_array(src.read(i+1))
    bnd_list.append(bnd)

dask_band_stack = da.dstack(bnd_list)
dask_band_stack = dask_band_stack.rechunk('auto')
src_sr = dask_band_stack.compute()

flat_bnds = np.zeros((src_sr[:, :, 0].size, n_bands))

for i in range(n_bands):
    bnd_i = src_sr[:, :, i].flatten()
    bnd_norm = 1024*((bnd_i - bnd_i.min())/(bnd_i.max() - bnd_i.min()))
    flat_bnds[:, i] = bnd_norm

cor_lists = np.corrcoef(flat_bnds.transpose())

EigVal_cor, EigVec_cor = np.linalg.eig(cor_lists)
order2 = EigVal_cor.argsort()[::-1]
EigVal_cor = EigVal_cor[order2]

pc_values_cor = (EigVal_cor/sum(EigVal_cor)*100).round(2)

PC_std = np.matmul(flat_bnds, EigVec_cor)

PC_nrm = np.zeros(PC_std.shape)
for i in range(n_bands):
    PC_i = PC_std[:, i]
    pc_norm_mm = 1024*(PC_i - PC_i.min())/(PC_i.max() - PC_i.min())
    PC_nrm[:, i] = pc_norm_mm

PC_std_2d_norm = np.zeros((img_shape[0], img_shape[1], n_bands))
for i in range(n_bands):
    PC_std_2d_norm[:, :, i] = PC_nrm[:, i].reshape(-1, img_shape[1])

kwargs = src.meta
kwargs.update(
    dtype=rasterio.float32,
    count=8)

tmp_std_norm = np.moveaxis(PC_std_2d_norm, [0, 1, 2], [2, 1, 0])
PC_img_std_norm = np.swapaxes(tmp_std_norm, 1, 2)

with rasterio.open('./output.tif', 'w', **kwargs) as dst:
    dst.write(PC_img_std_norm.astype(rasterio.float32))