Calculate statistics on segments

Log into jupyter.gis.unbc.ca

Open a new terminal

  • mount gisfs
  • install rasterstats with conda (conda install rasterstats); after a couple of minutes, accept the proposed solution
  • installing this package changes the GDAL path; update the system to correct for this (sudo apt update && sudo apt upgrade)

Start a notebook for today's lab

Import packages

from rasterstats import zonal_stats
import geopandas as gpd
import pandas as pd
import numpy as np
import sys
import pickle
import matplotlib.pyplot as plt

Add file paths for easy reference

lab_root = "/home/jovyan/gisfs/L/GEOG457/Lab8/"
image_root = lab_root + "dstl-satellite-imagery-feature-detection/sixteen_band/sixteen_band/"
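Before going any further, it is worth confirming the share actually mounted; a minimal, optional check (both lines should print True):

import os
#Both should be True if gisfs is mounted and the lab paths are correct
print(os.path.isdir(lab_root))
print(os.path.isdir(image_root))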

Open the segments as a GeoPandas GeoDataFrame

training = gpd.read_file(lab_root + "training_data/polygons.shp")

class_list = [1, 2] #Limit Classes

training_class = training[training['ClassType'].astype(int).isin(class_list)]

#View the loaded data
training_class.tail()

Determine which images match our segments

image_set = training_class.ImageId.unique()[:4] #Limit Images
image_set

# Only keep segments we have images for
training_filtered = training_class[training_class['ImageId'].str.contains('|'.join(image_set))]
training_filtered
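As a quick sanity check before computing statistics, it helps to see how many polygons of each class remain per image; a minimal sketch:

#Count the remaining training polygons per image and class
print(training_filtered.groupby(['ImageId', 'ClassType']).size())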

Add fields to the data frame to store the statistics: six statistics for each of the 17 bands.

def seventeen_band_stats(original):
    #One NaN column per band (1-17) and statistic
    stat_names = ['min', 'max', 'mean', 'count', 'std', 'median']
    new_cols = {f'b{b}_{s}': [np.nan] for b in range(1, 18) for s in stat_names}
    return gpd.GeoDataFrame(
        pd.concat([original, pd.DataFrame(new_cols)], ignore_index=True)
    ).head(len(original))

  
training_stats = seventeen_band_stats(training_filtered)

#Confirm the df was updated
training_stats

For each image we opened, calculate statistics for all bands and classes

for image in image_set:
    dataA = image_root + image + "_A.tif" #SWIR bands
    dataM = image_root + image + "_M.tif" #Multispectral bands
    dataP = image_root + image + "_P.tif" #Panchromatic band
    print("Image: " + image)
    
    for cl in class_list:
        print("\tClass: " + str(cl))
        subset = training_stats[training_stats['ImageId'].str.contains(image)]
        subset = subset[subset['ClassType'] == cl]
        subset = subset[:min(len(subset), 25)] # Limit samples
        if(len(subset) > 0):
            band_1 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataP, band=1, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_2 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=2, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_3 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=3, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_4 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=5, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_5 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=7, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_6 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=1, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_7 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=4, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_8 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=6, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_9 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataM, band=8, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_10 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=1, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_11 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=2, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_12 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=3, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_13 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=4, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_14 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=5, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_15 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=6, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_16 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=7, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))
            band_17 = np.array(pd.DataFrame(zonal_stats(subset.geometry, dataA, band=8, nodata=-999, stats=['min', 'max', 'mean', 'count', 'std', 'median'])))

            training_stats.loc[subset.index, ['b1_min', 'b1_max', 'b1_mean', 'b1_count', 'b1_std', 'b1_median']] = band_1[:,:]
            training_stats.loc[subset.index, ['b2_min', 'b2_max', 'b2_mean', 'b2_count', 'b2_std', 'b2_median']] = band_2[:,:]
            training_stats.loc[subset.index, ['b3_min', 'b3_max', 'b3_mean', 'b3_count', 'b3_std', 'b3_median']] = band_3[:,:]
            training_stats.loc[subset.index, ['b4_min', 'b4_max', 'b4_mean', 'b4_count', 'b4_std', 'b4_median']] = band_4[:,:]
            training_stats.loc[subset.index, ['b5_min', 'b5_max', 'b5_mean', 'b5_count', 'b5_std', 'b5_median']] = band_5[:,:]
            training_stats.loc[subset.index, ['b6_min', 'b6_max', 'b6_mean', 'b6_count', 'b6_std', 'b6_median']] = band_6[:,:]
            training_stats.loc[subset.index, ['b7_min', 'b7_max', 'b7_mean', 'b7_count', 'b7_std', 'b7_median']] = band_7[:,:]
            training_stats.loc[subset.index, ['b8_min', 'b8_max', 'b8_mean', 'b8_count', 'b8_std', 'b8_median']] = band_8[:,:]
            training_stats.loc[subset.index, ['b9_min', 'b9_max', 'b9_mean', 'b9_count', 'b9_std', 'b9_median']] = band_9[:,:]
            training_stats.loc[subset.index, ['b10_min', 'b10_max', 'b10_mean', 'b10_count', 'b10_std', 'b10_median']] = band_10[:,:]
            training_stats.loc[subset.index, ['b11_min', 'b11_max', 'b11_mean', 'b11_count', 'b11_std', 'b11_median']] = band_11[:,:]
            training_stats.loc[subset.index, ['b12_min', 'b12_max', 'b12_mean', 'b12_count', 'b12_std', 'b12_median']] = band_12[:,:]
            training_stats.loc[subset.index, ['b13_min', 'b13_max', 'b13_mean', 'b13_count', 'b13_std', 'b13_median']] = band_13[:,:]
            training_stats.loc[subset.index, ['b14_min', 'b14_max', 'b14_mean', 'b14_count', 'b14_std', 'b14_median']] = band_14[:,:]
            training_stats.loc[subset.index, ['b15_min', 'b15_max', 'b15_mean', 'b15_count', 'b15_std', 'b15_median']] = band_15[:,:]
            training_stats.loc[subset.index, ['b16_min', 'b16_max', 'b16_mean', 'b16_count', 'b16_std', 'b16_median']] = band_16[:,:]
            training_stats.loc[subset.index, ['b17_min', 'b17_max', 'b17_mean', 'b17_count', 'b17_std', 'b17_median']] = band_17[:,:]
        print("\t" + str(len(subset)) + " samplels")

Remove null polygons (too small to have values)

training_cleaned = training_stats[training_stats['b17_count'] > 0]
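Optionally, report how many polygons were dropped:

#Polygons too small to cover a raster cell have a count of 0
print(str(len(training_stats) - len(training_cleaned)) + " null polygons removed")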

Pickle your results for later use

training_cleaned.to_pickle(lab_root + "zonal_stats.pkl")

Classification

from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.preprocessing import normalize
with open(lab_root + "zonal_stats.pkl", 'rb') as handle:
    stats = pickle.load(handle)
#stats = stats.drop({"ImageId", "geometry"}, axis=1)
stats = stats.loc[:, stats.columns != 'geometry']
stats = stats.loc[:, stats.columns != 'ImageId']
class_list = [1, 2] #Limit classes; change this list for the assignment questions
filter_stats = stats[stats['ClassType'].isin(class_list)]
cor = filter_stats.corr()
correlation_target = abs(cor['ClassType'])
#Entry 0 is ClassType's correlation with itself (1.0); skip it and keep the next two
relevant_features = correlation_target.sort_values(ascending=False)[1:3]
col_name = relevant_features.index
plt.figure(figsize=(15, 12))
plt.scatter(filter_stats[col_name[0]], filter_stats[col_name[1]], c=filter_stats['ClassType'])
plt.colorbar()
plt.xlabel(col_name[0])
plt.ylabel(col_name[1])
plt.show()
features = normalize(np.array(filter_stats[col_name]).astype(int)) #Features limited to the selected determinants
labels = np.array(filter_stats['ClassType'])
folds = 4
kfold = StratifiedKFold(n_splits=folds, shuffle=True)

fold = 0
model = np.empty([folds], dtype=object)
results = np.empty([folds], dtype=float)
for train_index, test_index in kfold.split(X=features, y=labels):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_features, train_labels = features[train_index], labels[train_index]
    test_features, test_labels = features[test_index], labels[test_index]
    print(train_features.shape)
    print(train_labels)
    SVM = svm.LinearSVC()
    SVM.fit(train_features, train_labels)
    
    # Run model on test data
    predictions = SVM.predict(test_features)
    
    # Convert results to pandas series, fill in missing classes with 0 values
    y_actual = pd.Series(test_labels, name="Actual")
    y_predicted = pd.Series(predictions, name="Predicted")
    accuracy = sum(y_actual == y_predicted)/y_actual.size*100
    model[fold] = SVM
    results[fold] = accuracy
    fold += 1
    print("Accuracy: " + str(round(accuracy, 1)) + "%")
    # Calculate confusion matrix (Margins add All summary)
    confusion = pd.crosstab(y_actual, y_predicted, rownames=['Actual'], colnames=['Predicted'], margins=True, dropna=False).reindex(columns=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], fill_value=0)
    print()
    print(confusion)
    print()
 
print(results)
print("Overall Average: " + str(round(np.mean(results),2)) + "\tStd.Dev: " + str(round(np.std(results),2)))
from sklearn.ensemble import RandomForestClassifier

features = normalize(np.array(filter_stats[col_name]).astype(int)) #Features limited to the selected determinants
labels = np.array(filter_stats['ClassType'])
folds = 4
kfold = StratifiedKFold(n_splits=folds, shuffle=True)

fold = 0
model = np.empty([folds], dtype=object)
results = np.empty([folds], dtype=float)
for train_index, test_index in kfold.split(X=features, y=labels):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_features, train_labels = features[train_index], labels[train_index]
    test_features, test_labels = features[test_index], labels[test_index]
    print(train_features.shape)
    print(train_labels)
    
    # Initialize random forest, and set number of trees to be used in ensemble
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
        
    rf.fit(train_features, train_labels)
    
    # Run model on test data
    predictions = rf.predict(test_features)
    
    # Convert results to pandas series, fill in missing classes with 0 values
    y_actual = pd.Series(test_labels, name="Actual")
    y_predicted = pd.Series(predictions, name="Predicted")
    accuracy = sum(y_actual == y_predicted)/y_actual.size*100
    model[fold] = rf
    results[fold] = accuracy
    fold += 1
    print("Accuracy: " + str(round(accuracy, 1)) + "%")
    # Calculate confusion matrix (Margins add All summary)
    confusion = pd.crosstab(y_actual, y_predicted, rownames=['Actual'], colnames=['Predicted'], margins=True, dropna=False).reindex(columns=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], fill_value=0)
    print()
    print(confusion)
    print()
 
print(results)
print("Overall Average: " + str(round(np.mean(results),2)) + "\tStd.Dev: " + str(round(np.std(results),2)))
from sklearn.tree import export_graphviz
export_graphviz(model[0].estimators_[0], out_file='random_forest.dot', class_names = ['Building', 'Misc, Structure', 'Road', 'Track', 'Trees', 'Crops', 'Swift Water', 'Still Water', 'Truck', 'Car'], filled=True, rounded=True)
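To view the exported tree inside the notebook, one option (assuming the graphviz Python package is installed on the server) is:

import graphviz
#Read the exported .dot file; the Source object renders inline in Jupyter
graphviz.Source(open('random_forest.dot').read())

Alternatively, the dot command-line tool can convert random_forest.dot to an image.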

Assignment

  • Implement the above classification using 3-band data (use the _M files from sixteen_band, loading only the first 3 channels); you will need to update the data structures and the way images are opened to accommodate this. Limit images to 8 and max samples/class/image to 250, and use all classes.
    • Submit the entire notebook with results (file download).
    • Make sure Confusion Matrices and Overall Accuracy are printed.
  • Answer the following based upon the 17-band data in the uploaded pickle, with the following classes: 1 – ‘Building’, 2 – ‘Misc, Structure’, 3 – ‘Road’, 4 – ‘Track’, 5 – ‘Trees’, 6 – ‘Crops’, 7 – ‘Swift Water’, 8 – ‘Still Water’, 9 – ‘Truck’, 10 – ‘Car’.
    • What are the top two statistics for discriminating between Swift Water and Still Water?
    • What is the change in SVM accuracy between using the top two determinants and the top 10 determinants?
      Tip: Change relevant_features = correlation_target.sort_values(ascending=False)[1:3] to relevant_features = correlation_target.sort_values(ascending=False)[1:11]
    • Try running both SVM and Random Forest with all determinants (relevant_features = correlation_target.sort_values(ascending=False)[1:]). Do they run? How does accuracy compare?
    • For Random Forest, using the top 4 determinants and all classes, compare accuracy between max_depth=3, 4, and 5.
