Calculate statistics on segments
Log into jupyter.gis.unbc.ca
Open a new terminal
- Mount gisfs
- Install rasterstats with conda (conda install rasterstats); after a couple of minutes, accept the proposed solution
- Installing this package changes the GDAL path; update the system to correct for this (sudo apt update && sudo apt upgrade)
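To confirm the install worked before starting the lab, a quick import in a notebook cell is enough:

# Quick sanity check that the new package imports cleanly
import rasterstats
print("rasterstats imported OK")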
Start a notebook for today's lab
Import packages
from rasterstats import zonal_stats
import geopandas as gpd
import pandas as pd
import numpy as np
import sys
import pickle
import matplotlib.pyplot as plt
Add file paths for easy reference
lab_root = "/home/jovyan/gisfs/L/GEOG457/Lab8/"
image_root = lab_root + "dstl-satellite-imagery-feature-detection/sixteen_band/sixteen_band/"
Open the segments as a GeoPandas GeoDataFrame
training = gpd.read_file(lab_root + "training_data/polygons.shp")

# Limit classes (isin keeps exactly the listed class codes)
class_list = [1, 2]
training_class = training[training['ClassType'].isin(class_list)]

# View loaded data
training_class.tail()
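An optional sanity check: counting polygons per remaining class confirms the filter behaved as intended.

# Optional check: polygon counts per remaining class
training_class['ClassType'].value_counts()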
Determine which images match our segments
# Limit images
image_set = training_class.ImageId.unique()[:4]
image_set

# Only keep segments we have images for (exact ImageId matches)
training_filtered = training_class[training_class['ImageId'].isin(image_set)]
training_filtered
Add fields to the data frame to store statistics (rather than typing all 102 column names by hand, generate them with a loop).
def seventeen_band_stats(original):
    # Add 17 bands x 6 statistics = 102 empty columns to hold the zonal statistics
    stats_list = ['min', 'max', 'mean', 'count', 'std', 'median']
    result = original.copy()
    for band in range(1, 18):
        for stat in stats_list:
            result[f'b{band}_{stat}'] = np.nan
    return result

training_stats = seventeen_band_stats(training_filtered)

# Confirm df was updated
training_stats
For each image we opened, calculate statistics for all bands and classes
stats_list = ['min', 'max', 'mean', 'count', 'std', 'median']

for image in image_set:
    dataA = image_root + image + "_A.tif"
    dataM = image_root + image + "_M.tif"
    dataP = image_root + image + "_P.tif"

    # (raster file, band within that file) feeding each of the 17 output bands, in order
    band_sources = [(dataP, 1),
                    (dataM, 2), (dataM, 3), (dataM, 5), (dataM, 7),
                    (dataM, 1), (dataM, 4), (dataM, 6), (dataM, 8),
                    (dataA, 1), (dataA, 2), (dataA, 3), (dataA, 4),
                    (dataA, 5), (dataA, 6), (dataA, 7), (dataA, 8)]

    print("Image: " + image)
    for cl in class_list:
        print("\tClass: " + str(cl))
        subset = training_stats[training_stats['ImageId'] == image]
        subset = subset[subset['ClassType'] == cl]
        subset = subset[:min(len(subset), 25)]  # Limit samples
        if len(subset) > 0:
            for b, (raster, band) in enumerate(band_sources, start=1):
                # Select the stats columns explicitly so their order always
                # matches the b{n}_* column names being filled
                zs = pd.DataFrame(zonal_stats(subset.geometry, raster, band=band,
                                              nodata=-999, stats=stats_list))[stats_list]
                training_stats.loc[subset.index, [f'b{b}_{s}' for s in stats_list]] = np.array(zs)
            print("\t" + str(len(subset)) + " samples")
Remove null polygons (too small to have values)
training_cleaned = training_stats[training_stats['b17_count'] > 0]
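Optionally, compare lengths to see how many null polygons were removed:

# Optional: report how many empty polygons were dropped
print(str(len(training_stats) - len(training_cleaned)) + " null polygons removed")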
Pickle your results for later use
training_cleaned.to_pickle(lab_root + "zonal_stats.pkl")
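To confirm the pickle was written correctly, it can be read straight back with pd.read_pickle (the variable name check below is just for illustration):

# Optional round trip: re-read the pickle we just wrote
check = pd.read_pickle(lab_root + "zonal_stats.pkl")
check.head()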
Classification
from sklearn.model_selection import StratifiedKFold
from sklearn import svm
from sklearn.preprocessing import normalize
# Open the uploaded pickle of 17-band statistics (used for the assignment questions below)
with open('training_stats.pkl', 'rb') as handle:
    stats = pickle.load(handle)

# Drop the non-numeric columns before computing correlations
#stats = stats.drop({"ImageId", "geometry"}, axis=1)
stats = stats.loc[:, stats.columns != 'geometry']
stats = stats.loc[:, stats.columns != 'ImageId']
# ClassType may be stored as a float here, so match against the string form of each class code
class_list = ['1.0', '2']
filter_stats = stats[stats['ClassType'].astype(str).str.match('|'.join(class_list))]
# Rank the statistics by the strength of their correlation with ClassType;
# entry [0] is ClassType's correlation with itself, so take [1:3] for the top two
cor = filter_stats.corr()
correlation_target = abs(cor['ClassType'])
relevant_features = correlation_target.sort_values(ascending=False)[1:3]
col_name = relevant_features.index
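Printing relevant_features shows which statistics were selected and how strongly each correlates with class; the assignment questions below rely on exactly this output:

# Inspect the selected statistics and their correlation strength
print(relevant_features)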
plt.figure(figsize=(15, 12))
plt.scatter(filter_stats[col_name[0]], filter_stats[col_name[1]], c=filter_stats['ClassType'])
plt.colorbar()
plt.xlabel(col_name[0])
plt.ylabel(col_name[1])
plt.show()
# Use only the selected statistics (col_name) as features; feeding the whole table
# in would include ClassType itself and leak the label into the model
features = normalize(np.array(filter_stats[col_name]).astype(int))
labels = np.array(filter_stats['ClassType'])

folds = 4
kfold = StratifiedKFold(n_splits=folds, shuffle=True)
fold = 0
model = np.empty([folds], dtype=object)
results = np.empty([folds], dtype=float)
for train_index, test_index in kfold.split(X=features, y=labels):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_features, train_labels = features[train_index], labels[train_index]
    test_features, test_labels = features[test_index], labels[test_index]
    print(train_features.shape)
    print(train_labels)

    SVM = svm.LinearSVC()
    SVM.fit(train_features, train_labels)

    # Run model on test data
    predictions = SVM.predict(test_features)

    # Convert results to pandas series, fill in missing classes with 0 values
    y_actual = pd.Series(test_labels, name="Actual")
    y_predicted = pd.Series(predictions, name="Predicted")
    accuracy = sum(y_actual == y_predicted) / y_actual.size * 100
    model[fold] = SVM
    results[fold] = accuracy
    fold += 1
    print("Accuracy: " + str(round(accuracy, 1)) + "%")

    # Calculate confusion matrix (margins add an All summary)
    confusion = pd.crosstab(y_actual, y_predicted, rownames=['Actual'], colnames=['Predicted'],
                            margins=True, dropna=False).reindex(columns=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], fill_value=0)
    print()
    print(confusion)

print()
print(results)
print("Overall Average: " + str(round(np.mean(results), 2)) + "\tStd.Dev: " + str(round(np.std(results), 2)))
from sklearn.ensemble import RandomForestClassifier

# As above, build the features from the selected statistics only
features = normalize(np.array(filter_stats[col_name]).astype(int))
labels = np.array(filter_stats['ClassType'])

folds = 4
kfold = StratifiedKFold(n_splits=folds, shuffle=True)
fold = 0
model = np.empty([folds], dtype=object)
results = np.empty([folds], dtype=float)
for train_index, test_index in kfold.split(X=features, y=labels):
    print("TRAIN:", train_index, "TEST:", test_index)
    train_features, train_labels = features[train_index], labels[train_index]
    test_features, test_labels = features[test_index], labels[test_index]
    print(train_features.shape)
    print(train_labels)

    # Initialize random forest, and set number of trees to be used in ensemble
    rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    rf.fit(train_features, train_labels)

    # Run model on test data
    predictions = rf.predict(test_features)

    # Convert results to pandas series, fill in missing classes with 0 values
    y_actual = pd.Series(test_labels, name="Actual")
    y_predicted = pd.Series(predictions, name="Predicted")
    accuracy = sum(y_actual == y_predicted) / y_actual.size * 100
    model[fold] = rf
    results[fold] = accuracy
    fold += 1
    print("Accuracy: " + str(round(accuracy, 1)) + "%")

    # Calculate confusion matrix (margins add an All summary)
    confusion = pd.crosstab(y_actual, y_predicted, rownames=['Actual'], colnames=['Predicted'],
                            margins=True, dropna=False).reindex(columns=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], fill_value=0)
    print()
    print(confusion)

print()
print(results)
print("Overall Average: " + str(round(np.mean(results), 2)) + "\tStd.Dev: " + str(round(np.std(results), 2)))
from sklearn.tree import export_graphviz
export_graphviz(model[0].estimators_[0], out_file='random_forest.dot',
                class_names=['Building', 'Misc. Structure', 'Road', 'Track', 'Trees', 'Crops',
                             'Swift Water', 'Still Water', 'Truck', 'Car'],
                filled=True, rounded=True)
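If the python-graphviz package happens to be installed in the environment (an assumption; installing it is not covered above), the exported .dot file can be rendered inline in the notebook:

# Render the exported tree inline (requires the graphviz Python package)
import graphviz
with open('random_forest.dot') as handle:
    dot_source = handle.read()
graphviz.Source(dot_source)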

Assignment
- Implement the above classification using 3-band data (use the _M files from sixteen_band, loading only the first 3 channels); you will need to update the data structures and the way images are opened to accommodate this. Limit images to 8, max samples per class per image to 250, and use all classes.
- Submit entire notebook with results (file download)
- Make sure Confusion Matrices and Overall Accuracy are printed.
- Answer the following based upon the 17-band data in the uploaded pickle, with the following classes: 1 - 'Building', 2 - 'Misc. Structure', 3 - 'Road', 4 - 'Track', 5 - 'Trees', 6 - 'Crops', 7 - 'Swift Water', 8 - 'Still Water', 9 - 'Truck', 10 - 'Car'
- What are the top two statistics for discriminating between Swift Water and Still Water?
- What is the change in SVM accuracy when discriminating using the top two determinants versus the top 10 determinants? Tip: change relevant_features = correlation_target.sort_values(ascending=False)[1:3] to relevant_features = correlation_target.sort_values(ascending=False)[1:11]
- Try running both SVM and Random Forest with all determinants (relevant_features = correlation_target.sort_values(ascending=False)[1:]). Do they run? How does accuracy compare?
- For Random Forest, using the top 4 determinants and all classes, compare accuracy between max_depth=3, 4, and 5 (see the sketch after this list).
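One way to organize the max_depth comparison, reusing the k-fold pattern from above (a minimal sketch, not the required solution; it assumes features and labels have already been rebuilt from the top 4 determinants with all classes included):

# Sketch of the max_depth comparison; assumes `features` and `labels` were
# rebuilt from the top 4 determinants with all classes included
for depth in [3, 4, 5]:
    fold_accuracy = []
    for train_index, test_index in StratifiedKFold(n_splits=4, shuffle=True).split(X=features, y=labels):
        rf = RandomForestClassifier(n_estimators=1000, max_depth=depth, n_jobs=-1)
        rf.fit(features[train_index], labels[train_index])
        fold_accuracy.append(sum(rf.predict(features[test_index]) == labels[test_index])
                             / len(test_index) * 100)
    print("max_depth=" + str(depth) + "\tAverage accuracy: " + str(round(np.mean(fold_accuracy), 2)) + "%")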