Source code for raster4ml.extraction

import os
import glob
import rasterio
import numpy as np
import pandas as pd
import geopandas as gpd
from rasterio.mask import mask
from tqdm import tqdm
from . import utils


[docs]def extract_by_points(image_path,
                      shape_path,
                      unique_id):
    """Extract value by a point shapefile.
    
    The function will extract the pixel values underneath each point
    in a given point shapefile.

    Parameters
    ----------
    image_path : str
        Path of the image.
    shape_path : str
        Path of the shapefile.
    unique_id : str
        A unique column in the shapefile which will be retained as
        the id.

    Returns
    -------
    pandas dataframe
        A pandas dataframe containing all the values per id.

    Raises
    ------
    ValueError
        The shapefile must be either Point or MultiPoint.
    """
    # Open files
    src = rasterio.open(image_path)
    img = src.read(1)
    shape = gpd.read_file(shape_path)

    # Check the geometry types of the shape
    geoms = shape.geom_type.values
    if all((geoms == 'Point') | (geoms == 'MultiPoint')):
        pass
    else:
        raise ValueError(
            "The shapefile must be either Point or MultiPoint.")

    # Image names
    image_name = os.path.basename(image_path).split('.')[0]

    # Check if the CRS of shapefile and raster data matches or not
    shape = utils.check_projection(src, shape)

    # Check the type of geometry in shape

    pixel_values = {}
    for i, point in enumerate(shape['geometry']):
        x = point.xy[0][0]
        y = point.xy[1][0]
        row, col = src.index(x, y)
        value = img[row, col]
        pixel_values[shape.loc[i, f'{unique_id}']] = [value]
    pixel_values = pd.DataFrame(pixel_values).T
    pixel_values.columns = [image_name]
    # Close the rasterio
    src.close()
    return pixel_values


[docs]def get_duplicated_columns(df):
    """Get the columns which has values all the same.

    Parameters
    ----------
    df : pandas DataFrame
        The pandas dataframe to check.   

    Returns
    -------
    list
        Lis of columns that needs to be removed.
    """    
    df = df.fillna(1, axis=1) # Replace all nans with 1
    duplicates = []
    for col in df.columns:
        if (df[col] == df[col][0]).all():
            duplicates.append(col)
    return duplicates

[docs]def batch_extract_by_points(image_paths,
                            shape_path,
                            unique_id):
    """Batch extract values from a set of raster data using point
    shapefile.

    Parameters
    ----------
    image_paths : list
        List of image paths.
    shape_path : str
        Path of the shapefile.
    unique_id : str
        A unique column in the shapefile which will be retained as
        the id.   

    Returns
    -------
    pandas dataframe
        A pandas dataframe containing all the values per id.
    """
    pixel_values_df = []
    for image_path in tqdm(image_paths):
        pixel_values = extract_by_points(image_path, shape_path, unique_id)
        pixel_values_df.append(pixel_values)
    pixel_values_df = pd.concat(pixel_values_df, axis=1)
    
    # Check if there are duplicate columns
    cols_to_remove = get_duplicated_columns(pixel_values_df)
    if len(cols_to_remove) > 0:
        pixel_values_df = pixel_values_df.drop(columns=cols_to_remove)
        print(f'{len(cols_to_remove)} columns were removed from the dataframe as they had duplicated values.')
    return pixel_values_df


[docs]def extract_by_polygons(image_path,
                        shape_path,
                        unique_id,
                        statistics='all',
                        prefix=None):
    """Extract value from a raster data using a polygon shapefile.
    Similar to Zonal Statistics.

    Parameters
    ----------
    image_path : str
        Path of the image.
    shape_path : str
        Path of the shapefile.
    unique_id : str
        A unique column in the shapefile which will be retained as
        the id.
    statistics : str or list
        List of statistics to be calculated if shape is polygon.
        Accepted statsitcs are either 'all' or a list containing 
        follwoing statistics:
        'mean', 'median', 'mode', 'sum', 'min', 'max', 'std', 'range',
        'iqr', 'unique'.
        If only one statistic to be calculated, that should be inside
        a list. For example, if only 'mean' is to be calculated, it
        should be given as ['mean'].
    prefix : str, optional
        If predix is given, then the prefix will be used in front of
        the statistics name within the final dataframe column,
        by default None

    Returns
    -------
    pandas dataframe
        A pandas dataframe containing all the statistics values per
        id.

    Raises
    ------
    ValueError
        The shapefile must be either Polygon or MultiPolygon.
    """
    # Open files
    src = rasterio.open(image_path)
    shape = gpd.read_file(shape_path)

    # Check the geometry types of the shape
    geoms = shape.geom_type.values
    if all((geoms == 'Polygon') | (geoms == 'MultiPolygon')):
        pass
    else:
        raise ValueError(
            "The shapefile must be either Polygon or MultiPolygon.")

    # Check if the CRS of shapefile and raster data matches or not
    shape = utils.check_projection(src, shape)

    stats = {}

    for i, polygon in enumerate(shape['geometry']):
        mask_img, _ = mask(src, [polygon], nodata=np.nan, crop=True)
        mask_img = mask_img.reshape(-1)

        if statistics == 'all':
            statistics = ['mean', 'median', 'sum', 'min',
                          'max', 'std', 'range', 'iqr', 'unique'] # removed mode

        # Check if all values are nan
        if np.isnan(mask_img).all():
            stats_values = [np.nan]*len(statistics)
        else:
            mask_img = mask_img[~np.isnan(mask_img)]
            stats_values = []
            if 'mean' in statistics:
                stats_values.append(np.mean(mask_img))
            if 'median' in statistics:
                stats_values.append(np.median(mask_img))
            #if 'mode' in statistics:
            #    stats_values.append(np.bincount(mask_img).argmax())
            if 'sum' in statistics:
                stats_values.append(np.sum(mask_img))
            if 'min' in statistics:
                stats_values.append(np.min(mask_img))
            if 'max' in statistics:
                stats_values.append(np.max(mask_img))
            if 'std' in statistics:
                stats_values.append(np.std(mask_img))
            if 'range' in statistics:
                stats_values.append(np.max(mask_img)-np.min(mask_img))
            if 'iqr' in statistics:
                stats_values.append(np.subtract(
                    *np.percentile(mask_img, [75, 25])))
            if 'unique' in statistics:
                stats_values.append(np.unique(mask_img).shape[0])

        stats[shape.loc[i, f'{unique_id}']] = stats_values

    stats = pd.DataFrame(stats).T
    if prefix is None:
        stats.columns = statistics
    else:
        stats.columns = [
            f'{str(prefix)}_{statistic}' for statistic in statistics]

    # Close rasterio
    src.close()

    return stats


[docs]def batch_extract_by_polygons(image_paths,
                              shape_path,
                              unique_id,
                              statistics='all'):
    """Batch extract value by a polygon shapefile from a given image
    paths. Similar to zonal statistics.

    Parameters
    ----------
    image_paths : list
        List of image paths.
    shape_path : str
        Path of the shapefile.
    unique_id : str
        A unique column in the shapefile which will be retained as
        the id.
    statistics : str or list
        List of statistics to be calculated if shape is polygon.
        Accepted statsitcs are either 'all' or a list containing 
        follwoing statistics:
        'mean', 'median', 'mode', 'sum', 'min', 'max', 'std', 'range',
        'iqr', 'unique'.
        If only one statistic to be calculated, that should be inside
        a list. For example, if only 'mean' is to be calculated, it
        should be given as ['mean'].
    prefix : str, optional
        If predix is given, then the prefix will be used in front of
        the statistics name within the final dataframe column,
        by default None 

    Returns
    -------
    pandas dataframe
        A pandas dataframe containing all the statistics values per
        id. Each column name will be made through automatically adding
        a prefix (which is the filename of each image) and the
        corresponding statistics.
    """
    stats_df = []
    image_paths = glob.glob(os.path.join(image_paths, "*.tif"))
    for image_path in tqdm(image_paths):
        prefix = os.path.basename(image_path).split('.')[0]
        stats = extract_by_polygons(image_path,
                                    shape_path,
                                    unique_id,
                                    statistics, 
                                    prefix=prefix)
        stats_df.append(stats)
    stats_df = pd.concat(stats_df, axis=1)

    # Check if there are duplicate columns
    cols_to_remove = get_duplicated_columns(stats_df)
    if len(cols_to_remove) > 0:
        stats_df = stats_df.drop(columns=cols_to_remove)
        print(f'{len(cols_to_remove)} columns were removed from the dataframe as they had duplicated values.')
    return stats_df


[docs]def clip_by_polygons(image_path,
                     shape_path,
                     unique_id,
                     out_dir,
                     out_type='numpy'):
    """Clip a raster image by a polygon shapefile.

    Based on the geometry of each polygon, the function will clip the
    images and save it in a given directory with a unique name.

    Parameters
    ----------
    image_path : str
        Path of the image.
    shape_path : str
        Path of the shapefile.
    unique_id : str
        A unique column in the shapefile which will be retained as
        the id.
    out_dir : str
        Path of the directory where the clipped images will be saved.
    out_type : str, optional
        The type of output data. It can be either 'numpy' or 'tif',
        by default 'numpy'

    Returns
    -------
    None
        Saves the clipped data.

    Raises
    ------
    ValueError
        The shapefile must be either Polygon or MultiPolygon.
    """
    # Open files
    src = rasterio.open(image_path)
    shape = gpd.read_file(shape_path)

    # Check the geometry types of the shape
    geoms = shape.geom_type.values
    if all((geoms == 'Polygon') | (geoms == 'MultiPolygon')):
        pass
    else:
        raise ValueError(
            "The shapefile must be either Polygon or MultiPolygon.")

    # Check if the CRS of shapefile and raster data matches or not
    shape = utils.check_projection(src, shape)

    for i, polygon in enumerate(shape['geometry']):
        mask_img, transform = mask(src, [polygon], nodata=0, crop=True)
        if out_type == 'numpy':
            out_path = os.path.join(out_dir, str(
                shape.loc[i, f'{unique_id}'])+'.npy')
            mask_img = np.moveaxis(mask_img, 0, 2)
            np.save(out_path, mask_img)
        elif out_type == 'tif':
            out_path = os.path.join(out_dir, str(
                shape.loc[i, f'{unique_id}'])+'.tif')
            utils.save_raster(src, mask_img, out_path,
                              driver='GTiff', nodata=0,
                              width=mask_img.shape[2],
                              height=mask_img.shape[1],
                              transform=transform)
    return None