Source code for raster4ml.extraction

import os
import glob
import rasterio
import numpy as np
import pandas as pd
import geopandas as gpd
from rasterio.mask import mask
from tqdm import tqdm
from . import utils


[docs]def extract_by_points(image_path, shape_path, unique_id): """Extract value by a point shapefile. The function will extract the pixel values underneath each point in a given point shapefile. Parameters ---------- image_path : str Path of the image. shape_path : str Path of the shapefile. unique_id : str A unique column in the shapefile which will be retained as the id. Returns ------- pandas dataframe A pandas dataframe containing all the values per id. Raises ------ ValueError The shapefile must be either Point or MultiPoint. """ # Open files src = rasterio.open(image_path) img = src.read(1) shape = gpd.read_file(shape_path) # Check the geometry types of the shape geoms = shape.geom_type.values if all((geoms == 'Point') | (geoms == 'MultiPoint')): pass else: raise ValueError( "The shapefile must be either Point or MultiPoint.") # Image names image_name = os.path.basename(image_path).split('.')[0] # Check if the CRS of shapefile and raster data matches or not shape = utils.check_projection(src, shape) # Check the type of geometry in shape pixel_values = {} for i, point in enumerate(shape['geometry']): x = point.xy[0][0] y = point.xy[1][0] row, col = src.index(x, y) value = img[row, col] pixel_values[shape.loc[i, f'{unique_id}']] = [value] pixel_values = pd.DataFrame(pixel_values).T pixel_values.columns = [image_name] # Close the rasterio src.close() return pixel_values
[docs]def get_duplicated_columns(df): """Get the columns which has values all the same. Parameters ---------- df : pandas DataFrame The pandas dataframe to check. Returns ------- list Lis of columns that needs to be removed. """ df = df.fillna(1, axis=1) # Replace all nans with 1 duplicates = [] for col in df.columns: if (df[col] == df[col][0]).all(): duplicates.append(col) return duplicates
[docs]def batch_extract_by_points(image_paths, shape_path, unique_id): """Batch extract values from a set of raster data using point shapefile. Parameters ---------- image_paths : list List of image paths. shape_path : str Path of the shapefile. unique_id : str A unique column in the shapefile which will be retained as the id. Returns ------- pandas dataframe A pandas dataframe containing all the values per id. """ pixel_values_df = [] for image_path in tqdm(image_paths): pixel_values = extract_by_points(image_path, shape_path, unique_id) pixel_values_df.append(pixel_values) pixel_values_df = pd.concat(pixel_values_df, axis=1) # Check if there are duplicate columns cols_to_remove = get_duplicated_columns(pixel_values_df) if len(cols_to_remove) > 0: pixel_values_df = pixel_values_df.drop(columns=cols_to_remove) print(f'{len(cols_to_remove)} columns were removed from the dataframe as they had duplicated values.') return pixel_values_df
[docs]def extract_by_polygons(image_path, shape_path, unique_id, statistics='all', prefix=None): """Extract value from a raster data using a polygon shapefile. Similar to Zonal Statistics. Parameters ---------- image_path : str Path of the image. shape_path : str Path of the shapefile. unique_id : str A unique column in the shapefile which will be retained as the id. statistics : str or list List of statistics to be calculated if shape is polygon. Accepted statsitcs are either 'all' or a list containing follwoing statistics: 'mean', 'median', 'mode', 'sum', 'min', 'max', 'std', 'range', 'iqr', 'unique'. If only one statistic to be calculated, that should be inside a list. For example, if only 'mean' is to be calculated, it should be given as ['mean']. prefix : str, optional If predix is given, then the prefix will be used in front of the statistics name within the final dataframe column, by default None Returns ------- pandas dataframe A pandas dataframe containing all the statistics values per id. Raises ------ ValueError The shapefile must be either Polygon or MultiPolygon. """ # Open files src = rasterio.open(image_path) shape = gpd.read_file(shape_path) # Check the geometry types of the shape geoms = shape.geom_type.values if all((geoms == 'Polygon') | (geoms == 'MultiPolygon')): pass else: raise ValueError( "The shapefile must be either Polygon or MultiPolygon.") # Check if the CRS of shapefile and raster data matches or not shape = utils.check_projection(src, shape) stats = {} for i, polygon in enumerate(shape['geometry']): mask_img, _ = mask(src, [polygon], nodata=np.nan, crop=True) mask_img = mask_img.reshape(-1) if statistics == 'all': statistics = ['mean', 'median', 'sum', 'min', 'max', 'std', 'range', 'iqr', 'unique'] # removed mode # Check if all values are nan if np.isnan(mask_img).all(): stats_values = [np.nan]*len(statistics) else: mask_img = mask_img[~np.isnan(mask_img)] stats_values = [] if 'mean' in statistics: stats_values.append(np.mean(mask_img)) if 'median' in statistics: stats_values.append(np.median(mask_img)) #if 'mode' in statistics: # stats_values.append(np.bincount(mask_img).argmax()) if 'sum' in statistics: stats_values.append(np.sum(mask_img)) if 'min' in statistics: stats_values.append(np.min(mask_img)) if 'max' in statistics: stats_values.append(np.max(mask_img)) if 'std' in statistics: stats_values.append(np.std(mask_img)) if 'range' in statistics: stats_values.append(np.max(mask_img)-np.min(mask_img)) if 'iqr' in statistics: stats_values.append(np.subtract( *np.percentile(mask_img, [75, 25]))) if 'unique' in statistics: stats_values.append(np.unique(mask_img).shape[0]) stats[shape.loc[i, f'{unique_id}']] = stats_values stats = pd.DataFrame(stats).T if prefix is None: stats.columns = statistics else: stats.columns = [ f'{str(prefix)}_{statistic}' for statistic in statistics] # Close rasterio src.close() return stats
[docs]def batch_extract_by_polygons(image_paths, shape_path, unique_id, statistics='all'): """Batch extract value by a polygon shapefile from a given image paths. Similar to zonal statistics. Parameters ---------- image_paths : list List of image paths. shape_path : str Path of the shapefile. unique_id : str A unique column in the shapefile which will be retained as the id. statistics : str or list List of statistics to be calculated if shape is polygon. Accepted statsitcs are either 'all' or a list containing follwoing statistics: 'mean', 'median', 'mode', 'sum', 'min', 'max', 'std', 'range', 'iqr', 'unique'. If only one statistic to be calculated, that should be inside a list. For example, if only 'mean' is to be calculated, it should be given as ['mean']. prefix : str, optional If predix is given, then the prefix will be used in front of the statistics name within the final dataframe column, by default None Returns ------- pandas dataframe A pandas dataframe containing all the statistics values per id. Each column name will be made through automatically adding a prefix (which is the filename of each image) and the corresponding statistics. """ stats_df = [] image_paths = glob.glob(os.path.join(image_paths, "*.tif")) for image_path in tqdm(image_paths): prefix = os.path.basename(image_path).split('.')[0] stats = extract_by_polygons(image_path, shape_path, unique_id, statistics, prefix=prefix) stats_df.append(stats) stats_df = pd.concat(stats_df, axis=1) # Check if there are duplicate columns cols_to_remove = get_duplicated_columns(stats_df) if len(cols_to_remove) > 0: stats_df = stats_df.drop(columns=cols_to_remove) print(f'{len(cols_to_remove)} columns were removed from the dataframe as they had duplicated values.') return stats_df
[docs]def clip_by_polygons(image_path, shape_path, unique_id, out_dir, out_type='numpy'): """Clip a raster image by a polygon shapefile. Based on the geometry of each polygon, the function will clip the images and save it in a given directory with a unique name. Parameters ---------- image_path : str Path of the image. shape_path : str Path of the shapefile. unique_id : str A unique column in the shapefile which will be retained as the id. out_dir : str Path of the directory where the clipped images will be saved. out_type : str, optional The type of output data. It can be either 'numpy' or 'tif', by default 'numpy' Returns ------- None Saves the clipped data. Raises ------ ValueError The shapefile must be either Polygon or MultiPolygon. """ # Open files src = rasterio.open(image_path) shape = gpd.read_file(shape_path) # Check the geometry types of the shape geoms = shape.geom_type.values if all((geoms == 'Polygon') | (geoms == 'MultiPolygon')): pass else: raise ValueError( "The shapefile must be either Polygon or MultiPolygon.") # Check if the CRS of shapefile and raster data matches or not shape = utils.check_projection(src, shape) for i, polygon in enumerate(shape['geometry']): mask_img, transform = mask(src, [polygon], nodata=0, crop=True) if out_type == 'numpy': out_path = os.path.join(out_dir, str( shape.loc[i, f'{unique_id}'])+'.npy') mask_img = np.moveaxis(mask_img, 0, 2) np.save(out_path, mask_img) elif out_type == 'tif': out_path = os.path.join(out_dir, str( shape.loc[i, f'{unique_id}'])+'.tif') utils.save_raster(src, mask_img, out_path, driver='GTiff', nodata=0, width=mask_img.shape[2], height=mask_img.shape[1], transform=transform) return None