Source code for w4h.layers

"""The Layers module contains functions for splitting data into a layered model
and for interpolating data within the layers
"""

import datetime
import inspect
import numbers
import os
import pathlib

import xarray as xr
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rioxarray as rxr
from scipy import interpolate
from scipy.spatial import Voronoi, voronoi_plot_2d
from shapely import Point, Polygon, wkt

import w4h
from w4h import logger_function, verbose_print



# Get layer depths of each layer, based on precalculated layer thickness

[docs]
def get_layer_depths(df_with_depths, surface_elev_col='SURFACE_ELEV',
                     layer_thick_col='LAYER_THICK', layers=9, log=False):

    """Function to calculate depths and elevations of each model layer
    at each well based on surface elevation, bedrock elevation,
    and number of layers/layer thickness

    Parameters
    ----------
    df_with_depths : pandas.DataFrame
        DataFrame containing well metdata
    layers : int, default=9
        Number of layers. This should correlate with
        get_drift_thick() input parameter, if drift thickness was calculated
        using that function, by default 9.
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    pandas.DataFrame
        DataFrame containing new columns for depth to/elevation of layers.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    for layer in range(0, layers):  # For each layer
        # Make column names
        depthColName = 'DEPTH_LAYER'+str(layer+1)

        # Calculate depth to each layer at each well, in feet and meters
        df_with_depths[depthColName] = df_with_depths[layer_thick_col] * layer

    for layer in range(0, layers):  # For each layer
        elevColName = 'ELEV_LAYER'+str(layer+1)
        # elevMColName = 'ELEV_M_LAYER'+str(layer)

        df_with_depths[elevColName] = df_with_depths[surface_elev_col] - df_with_depths[layer_thick_col] * layer

    return df_with_depths



# Function to Merge tables

[docs]
def merge_metadata(data_df, header_df,
                   well_id_col='API_NUMBER', data_cols=None,
                   header_cols=None, auto_pick_cols=False,
                   drop_duplicate_cols=True,
                   log=False, verbose=False, **kwargs):

    """Function to merge tables, intended for merging metadata and data tables

    Parameters
    ----------
    data_df : pandas.DataFrame
        "Left" dataframe, intended for this purpose to be
        dataframe with main data, but can be anything
    header_df : pandas.DataFrame
        "Right" dataframe, intended for this purpose to be
        dataframe with metadata, but can be anything
    data_cols : list, optional
        List of strings of column names, for columns
        to be included after join from "left" table (data table).
        If None, all columns are kept, by default None
    header_cols : list, optional
        List of strings of columns names, for columns to be included
        in merged table after merge from "right" table (metadata).
        If None, all columns are kept, by default None
    auto_pick_cols : bool, default = False
        Whether to autopick the columns from the metadata table.
        If True, the following column names are kept:
        [`well_id_col`, 'LATITUDE', 'LONGITUDE', 'BEDROCK_ELEV',
        'SURFACE_ELEV', 'BEDROCK_DEPTH', 'LAYER_THICK'], by default False.
    drop_duplicate_cols : bool, optional
        If True, drops duplicate columns from the tables so that
        columns do not get renamed upon merge, by default True.
    log : bool, default = False
        Whether to log inputs and outputs to log file.
    **kwargs
        kwargs that are passed directly to pd.merge().
        By default, the 'on' and 'how' parameters are defined as
        on=`well_id_col` and how='inner'.

    Returns
    -------
    mergedTable : pandas.DataFrame
        Merged dataframe
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(merge_metadata, locals(), exclude_params=['data_df',
                                                                'header_df'])
    if header_df is None:
        # Figure out which columns to include
        if data_cols is None:
            # If not specified, get all the cols
            data_cols = data_df.columns
        else:
            if header_cols is not None:
                data_cols = data_cols + header_cols

        data_df = data_df[data_cols]

        mergedTable = data_df
    else:
        if auto_pick_cols:
            header_cols = [well_id_col, 'LATITUDE', 'LONGITUDE',
                           'BEDROCK_ELEV', 'SURFACE_ELEV', 'BEDROCK_DEPTH',
                           'LAYER_THICK']

            for c in header_df.columns:
                if c.startswith('ELEV_') or c.startswith('DEPTH'):
                    header_cols.append(c)
                if '_PROJ' in c:
                    header_cols.append(c)
            header_cols.append('geometry')
        elif header_cols is None:
            header_cols = header_df.columns
        else:
            header_cols = header_cols

        # If not specified, get all the cols
        if data_cols is None:
            data_cols = data_df.columns

        # Defults for on and how
        if 'on' not in kwargs.keys():
            kwargs['on'] = well_id_col

        if 'how' not in kwargs.keys():
            kwargs['how'] = 'inner'

        # Drop duplicate columns
        if drop_duplicate_cols:
            header_colCopy = header_cols.copy()
            remCount = 0
            for i, c in enumerate(header_colCopy):
                if c in data_cols and c != kwargs['on']:
                    if verbose:
                        print('Removing {} (duplicate columns) from data.'.format(header_cols[i-remCount]))
                    header_cols.pop(i - remCount)
                    remCount += 1

        leftTable_join = data_df[data_cols]
        rightTable_join = header_df[header_cols]

        mergedTable = pd.merge(left=leftTable_join, right=rightTable_join,
                               **kwargs)
    return mergedTable



# Interpolate layers to model model_grid

[docs]
def layer_interp(points, model_grid, layers=None, interp_kind='nearest',
                 surface_grid=None, bedrock_grid=None,
                 layer_thick_grid=None, drift_thick_grid=None,
                 return_type='dataset', export_dir=None,
                 target_col='TARG_THICK_PER', layer_col='LAYER',
                 xcol=None, ycol=None, xcoord='x', ycoord='y',
                 log=False, verbose=False, **kwargs):

    """Function to interpolate results (by default TARG_THICK_PER,
    or Target thickness percent per layer).
    Converts dataframe points to gridded data.
    Results are saved to Model_Layer variable
    in output xarray.Dataset, by default
    
    This function uses the scipy.interpolate module for interpolation.

     Different interpolation methods may be used by specifying `interp_kind=`:
     * `'Nearest'`: Nearest neighbor (fastest).
     Uses scipy.interpolate.NearestNDInterpolator()
     * `'Linear'`: Linear interpolation
     Uses scipy.interpolate.LinearNDInterpolator()
     * `'Inter2d'`: Spline interpolation
     Uses scipy.interpolate.bisplrep()
     * `'CloughTocher`': Cubic interpolation using clough-tocher method
     Uses scipy.interpolate.CloughTocher2DInterpolator()
     * `'Radial basis function'`: Radial basis function
     Uses scipy.interpolate.RBFInterpolator()

    Parameters
    ----------
    points : list
        List containing pandas dataframes or geopandas geoadataframes containing the point data. Should be resDF_list output from layer_target_thick().
    model_grid : xr.DataArray or xr.Dataset
        Xarray DataArray or DataSet with the coordinates/spatial reference of the output model_grid to interpolate to
    layers : int, default=None
        Number of layers for interpolation. If None, uses the length ofthe points list to determine number of layers. By default None.
    interp_kind : str, {'nearest', 'interp2d','linear', 'cloughtocher', 'radial basis function'}
        Type of interpolation to use. See scipy.interpolate N-D scattered. Values can be any of the following (also shown in "kind" column of N-D scattered section of table here: https://docs.scipy.org/doc/scipy/tutorial/interpolate.html). By default 'nearest'
    return_type : str, {'dataset', 'dataarray'}
        Type of xarray object to return, either xr.DataArray or xr.Dataset, by default 'dataset.'
    export_dir : str or pathlib.Path, default=None
        Export directory for interpolated grids, using w4h.export_grids(). If None, does not export, by default None.
    target_col : str, default = 'TARG_THICK_PER'
        Name of column in points containing data to be interpolated, by default 'TARG_THICK_PER'.
    layer_col : str, default = 'Layer'
        Name of column containing layer number. Not currently used, by default 'LAYER'
    xcol : str, default = 'None'
        Name of column containing x coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
    ycol : str, default = 'None'
        Name of column containing y coordinates. If None, will look for 'geometry' column, as in a geopandas.GeoDataframe. By default None
    xcoord : str, default='x'
        Name of x coordinate in model_grid, used to extract x values of model_grid, by default 'x'
    ycoord : str, default='y'
        Name of y coordinate in model_grid, used to extract x values of model_grid, by default 'y'
    log : bool, default = True
        Whether to log inputs and outputs to log file.
    **kwargs
        Keyword arguments to be read directly into whichever scipy.interpolate function is designated by the interp_kind parameter.

    Returns
    -------
    interp_data : xr.DataArray or xr.Dataset, depending on return_type
        By default, returns an xr.DataArray object with the layers added as a new dimension called Layer. Can also specify return_type='dataset' to return an xr.Dataset with each layer as a separate variable.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    if verbose:
        verbose_print(layer_interp, locals(), exclude_params=['points', 'model_grid'])

    # Possible inputs (casefolded) for different interp methods
    nnList = ['nearest', 'nearest neighbor', 'nearestneighbor', 'near', 'n']
    splineList = ['interp2d', 'interp2', 'interp', 'spline', 'spl', 'sp', 's']
    linList = ['linear', 'lin', 'l']
    ctList = ['clough tocher', 'clough', 'cloughtocher', 'ct', 'c']
    rbfList = ['rbf', 'radial basis', 'radial basis function', 'r', 'radial']
    natneighborList = ['natural', 'natural neighbor', 'naturalneighbor', 'nat', 'natn']
    # Potential future additions:
    #   k-nearest neighbors from scikit-learn?
    #   kriging? (from pykrige or maybe also from scikit-learn)

    # Check and format input model grid
    if not isinstance(model_grid, (xr.DataArray, xr.Dataset)):
        if pathlib.Path(model_grid).exists():
            model_grid = xr.open_dataarray(model_grid)

    # Extract model grid coordinates
    X = np.round(model_grid[xcoord].values, 10)
    Y = np.round(model_grid[ycoord].values, 10)

    # Get number of layers
    # If layers is not specified, use the length of the points list
    if layers is None and (type(points) is list or type(points) is dict):
        layers = len(points)

    if len(points) != layers:
        print('You have specified a different number of layers than what is iterable in the points argument. This may not work properly.')

    if verbose:
        print('\tInterpolating target lithology at each layer:')

    # Loop through each layer and interpolate (2d interpolation within layer)
    daDict = {}
    for lyr in range(1, layers+1):

        # Get points for each layer
        if type(points) is list or type(points) is dict:
            pts = points[lyr-1]
        else:
            pts = points

        # Get x coordinates for each well point
        if xcol is None:
            if 'geometry' in pts.columns:
                dataX = pts['geometry'].x
            else:
                print('xcol not specified and geometry column not detected (points is not/does not contain geopandas.GeoDataFrame)')
                return
        else:
            dataX = pts[xcol]

        # Get x coordinates for each well point
        if ycol is None:
            if 'geometry' in pts.columns:
                dataY = pts['geometry'].y
            else:
                print('ycol not specified and geometry column not detected (points is not/does not contain geopandas.GeoDataFrame)')
                return
        else:
            dataY = pts[ycol]

        # Get 'z' coordinates (interpolated value) for each well point
        interpVal = pts[target_col]

        # Drop points without x, y, or z data (and maintain consistency)
        dataX = dataX.dropna()
        dataY = dataY.loc[dataX.index]
        dataY = dataY.dropna()
        interpVal = interpVal.loc[dataY.index]
        interpVal = interpVal.dropna()

        dataX = dataX.loc[interpVal.index]
        dataY = dataY.loc[interpVal.index]

        dataX = dataX.reset_index(drop=True)
        dataY = dataY.reset_index(drop=True)
        interpVal = interpVal.reset_index(drop=True)

        # Carry out interpolation
        # Nearest neighbor
        if interp_kind.lower() in nnList:
            interpType = 'Nearest Neighbor'
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            dataPoints = np.array(list(zip(dataX, dataY)))
            interp = interpolate.NearestNDInterpolator(dataPoints, interpVal,
                                                       **kwargs)
            Z = interp(X, Y)
            del interp

        # Linear
        elif interp_kind.lower() in linList:
            interpType = 'Linear'
            dataPoints = np.array(list(zip(dataX, dataY)))
            interp = interpolate.LinearNDInterpolator(dataPoints, interpVal,
                                                      **kwargs)
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            Z = interp(X, Y)
            del interp

        # Clough-toucher (cubic)
        elif interp_kind.lower() in ctList:
            interpType = 'Clough-Toucher'
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            if 'tol' not in kwargs:
                kwargs['tol'] = 1e10
            interp = interpolate.CloughTocher2DInterpolator(list(zip(dataX, dataY)), interpVal, **kwargs)
            Z = interp(X, Y)
            del interp

        # Radial basis function
        elif interp_kind.lower() in rbfList:
            interpType = 'Radial Basis'
            dataXY = np.column_stack((dataX, dataY))
            interp = interpolate.RBFInterpolator(dataXY, interpVal, **kwargs)
            print("Radial Basis Function does not work well with many well-based datasets. Consider instead specifying 'nearest', 'linear', 'spline', or 'clough tocher' for interpolation interp_kind.")
            Z = interp(np.column_stack((X.ravel(), Y.ravel()))).reshape(X.shape)
            del interp

        # Spline
        elif interp_kind.lower() in splineList:
            interpType = 'Spline Interpolation'
            Z = interpolate.bisplrep(dataX, dataY, interpVal, **kwargs)

        elif interp_kind.lower() in natneighborList:
            interpType = 'Natural Neighbor'
            Z = natural_neighbor_interp(points=pts, model_grid=model_grid)
        # Nearest neighbor by default if otherwise not specified
        else:
            if verbose:
                print(f'Specified interpolation (interp_kind={interp_kind}) not recognized, using nearest neighbor.')
                print("\tinterp_kind should be one of: 'nearest', 'linear', 'interp2d', 'cloughtocher', or 'spline'")

            interpType = 'Nearest Neighbor'
            X, Y = np.meshgrid(X, Y, sparse=True)  # 2D Grid for interpolation
            interp = interpolate.NearestNDInterpolator(list(zip(dataX, dataY)),
                                                       interpVal, **kwargs)
            Z = interp(X, Y)
            del interp

        # Create new datarray with new data values, else same as model_grid
        interp_grid = xr.DataArray(
                    data=Z,
                    dims=model_grid.dims,
                    coords=model_grid.coords)

        # Drop if only a single coordinate in "band" dimension
        if 'band' in interp_grid.coords:
            interp_grid = interp_grid.drop_vars('band')

        # Clip to 0-1 if percentage
        if target_col == 'TARG_THICK_PER':
            interp_grid = interp_grid.clip(min=0, max=1, keep_attrs=True)

        # Add coordinate in layer dimension for current layer
        interp_grid = interp_grid.expand_dims(dim='Layer')
        interp_grid = interp_grid.assign_coords(Layer=[lyr])

        # Delete to reduce memory usage (?)
        del Z
        del dataX
        del dataY
        del interpVal

        zFillDigs = len(str(layers))
        daDict['Layer'+str(lyr).zfill(zFillDigs)] = interp_grid
        del interp_grid
        if verbose:
            print('\t\tCompleted {} interpolation for Layer {}'.format(str(interpType).lower(), str(lyr).zfill(zFillDigs)))

    # Determine whether to export xarray DataArray or Dataset
    dataArrayList = ['dataarray', 'da', 'a', 'array']
    dataSetList = ['dataset', 'ds', 'set']

    interp_data = xr.concat(daDict.values(), dim='Layer')
    interp_data = interp_data.assign_coords(Layer=np.arange(1, layers+1))

    if return_type.lower() in dataArrayList:
        pass
    else:
        if return_type.lower() not in dataSetList and verbose:
            print(f"{return_type} is not a valid input for return_type. Please set return_type to either 'dataarray' or 'dataset'")
            print("Using dataset by default")            
        interp_data = xr.Dataset(data_vars={'Model_Layers': interp_data})
        if verbose:
            print('Done with interpolation, getting additional layers and attributes')

        # Get common attributes from all layers to use as "global" attributes
        common_attrs = {}
        for i, (var_name, data_array) in enumerate(interp_data.data_vars.items()):
            if i == 0:
                common_attrs = data_array.attrs
            else:
                common_attrs = {k: v for k, v in common_attrs.items() if k in data_array.attrs and data_array.attrs[k] == v}
        interp_data.attrs.update(common_attrs)

    if verbose:
        for i, layer_data in enumerate(interp_data):
            pts = points[i]
            pts.plot(c=pts[target_col])

    if export_dir is None:
        pass
    else:
        w4h.export_grids(grid_data=interp_data, out_path=export_dir, file_id='', filetype='tif', variable_sep=True, date_stamp=True)
        print('Exported to {}'.format(export_dir))

    return interp_data



# Function to export the result of thickness of target sediments in each layer

[docs]
def layer_target_thick(gdf, layers=9, well_id_col='API_NUMBER',
                       return_all=False, export_dir=None, outfile_prefix=None,
                       depth_top_col='TOP', depth_bot_col='BOTTOM', log=False,
                       **kwargs):
    
    """Function to calculate thickness of target material
       in each layer at each well point.
       This function loops through each model layer and divides up
       the well intervals from the input GeoDataFrame into 4 categories:
       * 1) Intervals that pierce top of model layer but end within it
       * 2) Intervals contained entirely within model layer
       * 3) Intervals that begin within the model layer but pierce the bottom
       * 4) Intervals that begin above and end below the model layer

       For each category, the amount/thickness of the target material
       within each layer is calculated. These records are then
       "truth-checked" to ensure there are not duplicates and combined.
       The percent thickness (target thickness/layer thickness) is calculated,
       before data is returned and/or exported.

    Parameters
    ----------
    gdf : geopandas.GeoDataFrame
        Geodataframe containing classified data, surface elevation,
        bedrock elevation, layer depths, geometry.
    layers : int, default=9
        Number of layers in model, by default 9
    well_id_col : str, default="API_NUMBER"
        The name of the column that is used for uniquely identifying each well
    return_all : bool, default=False
        If True, return list of original GeoDataFrames with extra column
        added for target thick for each layer.
        If False, return list of geopandas.GeoDataFrames with only
        essential information for each layer.
    export_dir : str or pathlib.Path, default=None
        If str or pathlib.Path, should be directory to which
        to export dataframes built in function.
    outfile_prefix : str, default=None
        Only used if export_dir is set.
        Will be used at the start of the exported filenames.
    depth_top_col : str, default='TOP'
        Name of column containing data for depth
        to top of described well intervals.
    depth_bot_col : str, default='BOTTOM'
        Name of column containing data for depth
        to bottom of described well intervals.
    log : bool, default = True
        Whether to log inputs and outputs to log file.

    Returns
    -------
    res_df and/or res : list
        A list of Geopandas GeoDataFrames containing only important information
        needed for next stage of analysis. If `return_all=True`, the input data
        with the actual descriptions will be returned
        as a separate list of GeoDataFrames.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    gdf['TOP_ELEV'] = gdf['SURFACE_ELEV'] - gdf[depth_top_col]
    gdf['BOT_ELEV'] = gdf['SURFACE_ELEV'] - gdf[depth_bot_col]

    layerList = range(1, layers+1)
    res_list = []
    resdf_list = []

    # Generate Column names based on (looped) integers
    for layer in layerList:
        zStr = 'ELEV'
        wellIntTop_col = 'TOP_ELEV'
        wellIntBot_col = 'BOT_ELEV'
        modelLyrTop_Col = zStr+'_LAYER'+str(layer)
        if layer != 9:  # For all layers except the bottom layer....
            modelLyrBot_col = zStr+'_LAYER'+str(layer+1)  # use the layer below
        else:  # Otherwise, ...
            modelLyrBot_col = "BEDROCK_"+zStr  # Use (corrected) brock depth

        # Divide records into 4 categories for ease of calculation:
            # 1: Well interval starts above layer top, ends within model layer
            # 2: Well interval is entirely contained withing model layer
            # 3: Well interval starts in layer, continues through bottom
            # 4: well interval begins/ends above/below (respective) model layer
        if isinstance(gdf[well_id_col], numbers.Number) or str(gdf[well_id_col]).isnumeric():
            gdf[well_id_col] = gdf[well_id_col].astype(int).astype(str)

        # records1 = intervals that go through the top of the layer and bottom is within layer
        records1 = gdf.loc[(gdf[wellIntTop_col] > gdf[modelLyrTop_Col]) &  # Top of the well interval is above or equal to the top of the layer
                           (gdf[wellIntBot_col] <= gdf[modelLyrTop_Col]) &  # Bottom is below the top of the layer
                           (gdf[wellIntBot_col] <= gdf[wellIntTop_col])].copy()  # Bottom is deeper than top (should already be the case)
        records1['TARG_THICK'] = pd.DataFrame(np.round((records1.loc[:, modelLyrTop_Col] - records1.loc[:, wellIntBot_col]) * records1['TARGET'], 3)).copy()

        # records2 = entire interval is within layer
        records2 = gdf.loc[(gdf[wellIntTop_col] <= gdf[modelLyrTop_Col]) &  # Top of the well is lower than top of the layer
                           (gdf[wellIntBot_col] >= gdf[modelLyrBot_col]) &  # Bottom of the well is above bottom of the layer
                           (gdf[wellIntBot_col] <= gdf[wellIntTop_col])].copy()  # Bottom ofthe well is deeper than or equal to top (should already be the case)
        records2['TARG_THICK'] = pd.DataFrame(np.round((records2.loc[:, wellIntTop_col] - records2.loc[:, wellIntBot_col]) * records2['TARGET'], 3)).copy()

        # records3 = intervals with top within layer and bottom of interval going through bottom of layer
        records3 = gdf.loc[(gdf[wellIntTop_col] >= gdf[modelLyrBot_col]) &  # Top of the well is above bottom of layer
                           (gdf[wellIntTop_col] <= gdf[modelLyrTop_Col]) &  # Top of well is below top of layer
                           (gdf[wellIntBot_col] < gdf[modelLyrBot_col]) &  # Bottom of the well is below bottom of layer
                           (gdf[wellIntBot_col] <= gdf[wellIntTop_col])].copy()  # Bottom is deeper than top (should already be the case)
        records3['TARG_THICK'] = pd.DataFrame(np.round((records3.loc[:, wellIntTop_col] - (records3.loc[:, modelLyrBot_col])) * records3['TARGET'], 3)).copy()

        # records4 = interval goes through entire layer
        records4 = gdf.loc[(gdf[wellIntTop_col] > gdf[modelLyrTop_Col]) &  # Top of well is above top of layer
                           (gdf[wellIntBot_col] < gdf[modelLyrBot_col]) &  # Bottom of well is below bottom of layer
                           (gdf[wellIntBot_col] <= gdf[wellIntTop_col])].copy()  # Bottom of well is below top of well
        records4['TARG_THICK'] = pd.DataFrame(np.round((records4.loc[:, modelLyrTop_Col] - records4.loc[:, modelLyrBot_col]) * records4['TARGET'], 3)).copy()

        # Truth check: ensure only well data is not being duplicated
        # Do the "partial" records first (the records that don't go through the entire layer)
        inputRecordList = [records1, records3, records4]
        truthCheckedRecords = []  # initialize output list for merging later
        subSetCols = [well_id_col, 'TOP', "BOTTOM"]
        
        # Go through each set of records, and remove duplicate well intervals
        for recDF in inputRecordList:
            # First sort by target thickness (if duplicates, keep largest)
            recDFSorted = recDF.sort_values(by='TARG_THICK', ascending=False)

            # Drop duplicates and keep largest
            recDFSorted = recDFSorted.drop_duplicates(subset=subSetCols,
                                                      keep='first')
            
            # Re-sort by the well id column for consistency's sake
            recDF = recDFSorted.sort_values(by=well_id_col)

            # Append this category to list for later merging
            truthCheckedRecords.append(recDF)

        # Truth check records category 2, which may have multiples
        # First sort by target thickness (if duplicates, keep largest)
        rec2Sorted = records2.sort_values(by='TARG_THICK', ascending=False)
        
        # Drop duplicates and keep largest
        records2 = rec2Sorted.drop_duplicates(subset=subSetCols,
                                              keep='first')
        
        # Re-sort by the well id column for consistency's sake
        records2 = records2.sort_values(by=well_id_col)

        # Insert this category to list for merging
        truthCheckedRecords.insert(1, records2)

        # Merge four record categories back into single (geo)dataframe
        res = gpd.GeoDataFrame(pd.concat(truthCheckedRecords),
                               geometry='geometry', crs=gdf.crs).sort_values(by=well_id_col)

        # The sign may be reversed if using depth rather than elevation
        if (res['TARG_THICK'] < 0).all():
            res['TARG_THICK'] = res['TARG_THICK'] * -1
        
        # Cannot have negative thicknesses
        res['TARG_THICK'] = res['TARG_THICK'].clip(lower=0)
        res['LAYER_THICK'] = res['LAYER_THICK'].clip(lower=0)

        res.reset_index(drop=True, inplace=True)
        
        # Get geometries for each unique API/well
        # Calculate thickness for each well interval in the layer indicated
        #   (e.g., if there are two well intervals from same well in one model layer)

        # Group by well id column and location, and sum TARG_THICK for each entry
        res_df = res.groupby(by=[well_id_col, 'LATITUDE', 'LONGITUDE'],
                             as_index=False)['TARG_THICK'].sum()

        # Remove the columns so they are not repeated after merge
        mergeRes = res.drop(['LATITUDE', "LONGITUDE", "TARG_THICK"], axis=1)
        
        # Merge the grouped/summed dataframe with the main dataframe
        res_df = pd.merge(left=res_df, right=mergeRes,
                          how='left', on=well_id_col)

        # Calculate thickness as % of total layer thickness (and clip to 1)
        res_df['TARG_THICK_PER'] = pd.DataFrame(np.round(res_df['TARG_THICK']/res_df['LAYER_THICK'], 3)).clip(upper=1)
        
        # Replace np.inf and np.nans with 0 (to ease interpolation)
        res_df['TARG_THICK_PER'] = res_df['TARG_THICK_PER'].where(res_df['TARG_THICK_PER'] != np.inf, other=0)
        res_df['TARG_THICK_PER'] = res_df['TARG_THICK_PER'].where(res_df['TARG_THICK_PER'] != np.nan, other=0)

        res_df["LAYER"] = layer  # Include the layer # as a separate gdf column
        res_df = res_df[[well_id_col, 'LATITUDE', 'LONGITUDE', 'LATITUDE_PROJ',
                         'LONGITUDE_PROJ', 'TOP', 'BOTTOM', 'TOP_ELEV',
                         'BOT_ELEV', 'SURFACE_ELEV', modelLyrTop_Col,
                         modelLyrBot_col, 'LAYER_THICK',
                         'TARG_THICK', 'TARG_THICK_PER', 'LAYER',
                         'geometry']].copy()
        
        # Convert back to GeoDataFrame
        res_df = gpd.GeoDataFrame(res_df,
                                  geometry='geometry',
                                  crs=gdf.crs)

        # Add layer outputs to list to be returned
        resdf_list.append(res_df)
        res_list.append(res)

        # Export data if specified
        if isinstance(export_dir, (pathlib.PurePath, str, os.PathLike)):
            export_dir = pathlib.Path(export_dir)
            if export_dir.is_dir():
                pass
            else:
                try:
                    os.mkdir(export_dir)
                except Exception:
                    print('Specified export directory does not exist and cannot be created. Function will continue to run, but data will not be exported.')

            # Format and build export filepath
            zFillDigs = len(str(len(layerList)))
            if str(outfile_prefix).endswith('_'):
                outfile_prefix = outfile_prefix[:-1]
            elif outfile_prefix is None:
                outfile_prefix = ''

            nowStr = str(datetime.datetime.today().date())+'_'+str(datetime.datetime.today().hour)+'-'+str(datetime.datetime.today().minute)+'-'+str(datetime.datetime.today().second)
            outPath = export_dir.joinpath(outfile_prefix+'_Lyr'+str(layer).zfill(zFillDigs)+'_'+nowStr+'.csv')
            outPathStr = outPath.as_posix()
            
            if return_all:
                res.to_csv(outPathStr, sep=',',
                           na_rep=np.nan, index_label='ID')
                res_df.to_csv(outPathStr, sep=',',
                              na_rep=np.nan, index_label='ID')
            else:
                res_df.to_csv(outPathStr, sep=',',
                              na_rep=np.nan, index_label='ID')

    if return_all:
        return res_list, resdf_list
    else:
        return resdf_list



# Implementatino of natural neighbor interpolation

[docs]
def natural_neighbor_interp(points, model_grid=None, parallelize=False,
                            well_id='API_NUMBER', interp_value='TARG_THICK_PER',
                            xcoord='LONGITUDE_PROJ', ycoord='LATITUDE_PROJ', elev="SURFACE_ELEV",
                            point_crs='EPSG:5070', model_crs='EPSG:5070', output_crs='EPSG:5070',
                            show_points=False, sparsity_factor=10,
                            show_plot=False, show_adjacent_regions=False, show_voronoi=False,
                            time_segments=False, verbose=False):
    """Implementation of natural neighbor algorithm, with sparse interpolation.
    This interpolation algorithm takes a very long time.

    Parameters
    ----------
    points : {pd.DataFrame, gpd.GeoDataFrame}
        DataFrame with point data
    model_grid : {xr.DataArray, xr.Dataset}, optional
        xarray object with coordinates at grid for interpolation, by default None
    parallelize : bool, optional
        Whether to use parallelization. Currently in beta/not supported, by default False
    well_id : str, optional
        String with column name of Well ID, by default 'API_NUMBER'
    interp_value : str, optional
        Column name in points df to use for interpolation, by default 'ELEVATION'
    xcoord : str, optional
        X coordinate column name, by default 'LONGITUDE'
    ycoord : str, optional
        Y coordinate column name, by default 'LATITUDE'
    elev : str, optional
        Elevation column name, by default "ELEVATION"
    show_points : bool, optional
        Whether to show well points, by default False
    sparsity_factor : int, optional
        Factor by which to "decimate" the coordinates for Natural Neighbor analysis.
        These points are then interpolated linearly to the model grid.
        Using sparse coordinates saves computation time, by default 10
    show_plot : bool, optional
        Whether to show a plot of nearby points of interest for each point, 
        and overlap of new Voronoi and old, by default False
    show_adjacent_regions : bool, optional
        Whether to show the plot of the adjacent regions for each point, by default False
    show_voronoi : bool, optional
        Whether to show the voronoi polygons, by default False
    time_segments : bool, optional
        Whether to time each segment of the analysis, by default False
    verbose : bool, optional
        Whether to print additional results or parameters to the terminal

    Returns
    -------
    _type_
        _description_
    """
    time0 = datetime.datetime.now()

    # Get model grid in standardized format/crs
    if pathlib.Path(str(model_grid)).exists() and pathlib.Path(str(model_grid)).is_file():
        model_grid_IN = rxr.open_rasterio(model_grid)
    else:
        model_grid_IN = model_grid
    model_grid_IN = model_grid_IN.rio.reproject(output_crs)

    original_coords = model_grid_IN.coords
    if sparsity_factor is not None:
        if isinstance(model_grid_IN, xr.Dataset):
            interp_grid = model_grid_IN[:, ::sparsity_factor, ::sparsity_factor]
        else:
            interp_grid = model_grid_IN[::sparsity_factor, ::sparsity_factor]
    else:
        interp_grid = model_grid_IN

    uniqueWellDF = points.drop_duplicates(subset=well_id)[[well_id, xcoord, ycoord, elev, interp_value, 'geometry']].reset_index(drop=True)

    if not isinstance(uniqueWellDF, gpd.GeoDataFrame):
        uniqueWellDF['geometry'] = uniqueWellDF['geometry'].apply(wkt.loads)
        uniqueWellDF = gpd.GeoDataFrame(uniqueWellDF, geometry='geometry', crs=point_crs)
    uniqueWellDF = uniqueWellDF.to_crs(output_crs)

    minx, miny, maxx, maxy = uniqueWellDF.total_bounds
    interp_grid = interp_grid.rio.clip_box(minx=minx, miny=miny, maxx=maxx, maxy=maxy)
    
    wellPtList = []
    wellValueList = []
    for i, well in uniqueWellDF.iterrows():
        wellPtList.append([well.geometry.x, well.geometry.y])
        wellValueList.append(float(well[interp_value]))
    wellPtArr = np.array(wellPtList)

    if show_points:
        uniqueWellDF.plot(c=uniqueWellDF[interp_value], vmin=400, vmax=600)#xcoord, ycoord, kind='scatter')
        plt.show()

    wellPtList = []
    wellValueList = []
    for i, well in uniqueWellDF.iterrows():
        wellPtList.append([well[xcoord], well[ycoord]])
        wellValueList.append(float(well[interp_value]))

    vClass = Voronoi(points=wellPtArr)
    if time_segments:
        thisTime = lastTime = datetime.datetime.now()
        print(f"Voronoi time:  {thisTime-time0}")

    if show_voronoi:
        voronoi_plot_2d(vClass)
        plt.show()

    # Get vertices as variable
    verts = vClass.vertices

    # Iterate through all coordinates for interpolation
    # CODE HERE FOR ITERATING THROUGH COORDS?
    #newPoint = Point([-90.1, 38.7])
    
    def _vectorized_nat_neighbor(x, y, time_segments=time_segments, verbose=False):
        newPoint = Point([x, y])
        lastTime = datetime.datetime.now()
        
        # Find region containing the "newPoint" (point for interpolation)
        polygonList = []
        polyRegInd = []
        containingRegion = None
        for i, region in enumerate(vClass.regions):
            # Create (valid) shapely polygon for each region using vertices
            vertList = [verts[v].tolist() for v in region if v != -1]
            if len(vertList) > 0 and vertList[0] != vertList[-1]:
                vertList.append(vertList[0])

            if len(vertList) == 0:
                continue
            currPoly = Polygon(vertList)

            # If polygon contains the point, record and break loop
            if currPoly.is_valid and currPoly.contains(newPoint) and containingRegion is None:
                containingRegion = currPoly
                containingRegionIndex = i
                break
            polygonList.append(currPoly)
            polyRegInd.append(i)

        if containingRegion is None:
            if time_segments:
                thisTime = datetime.datetime.now()
                print(f"Polygon is invalid or no overlapping polygons with point, returning nan:  {thisTime-lastTime}")
                lastTime = datetime.datetime.now()                
            return np.nan

        if time_segments:
            thisTime = datetime.datetime.now()
            print(f"Make Initial Polygons time:  {thisTime-lastTime}")
            lastTime = datetime.datetime.now()

        # Get all regions that touch containing region
        # UPDATE THIS WITH GEOPANDAS FOR EFFICIENCY!!!!!!
        regionAndAdjacents = [containingRegion]
        regionIndices = [containingRegionIndex]
        for i, region in enumerate(polygonList):
            touchCondition = containingRegion.touches(region)

            if touchCondition:
                regionAndAdjacents.append(region)
                regionIndices.append(polyRegInd[i])

        # Now, get second layer of regions
        # UPDATE THIS WITH GEOPANDAS FOR EFFICIENCY!!!!!!
        regionAndAdjacents2 = regionAndAdjacents.copy()
        for i, region in enumerate(regionAndAdjacents2):
            for j, regInd in enumerate(vClass.point_region):
                regionVerts = [verts[v].tolist() for v in vClass.regions[regInd]]
                currPoly = Polygon(regionVerts)
                if region.touches(currPoly) and currPoly.is_valid and regInd not in regionIndices:
                    regionAndAdjacents.append(currPoly)
                    regionIndices.append(int(regInd))

        if time_segments:
            thisTime = datetime.datetime.now()
            print(f"Get all adjacent and semi-adjacent Polygons:  {thisTime-lastTime}")
            lastTime = datetime.datetime.now()

        # Get points and values for interpolation
        nearPoints = [newPoint.coords[0]]
        nearValues = [None]
        vPR = list(vClass.point_region)
        for ind in regionIndices:
            nearValues.append(wellValueList[vPR.index(ind)])
            nearPoints.append(vClass.points[vPR.index(ind)].tolist())
        regGS = gpd.GeoDataFrame(zip(regionIndices, regionAndAdjacents), columns=['RegionIndices', 'geometry'], crs=output_crs)

        if time_segments:
            thisTime = datetime.datetime.now()
            print(f"Extract points for interpolation:  {thisTime-lastTime}")
            lastTime = datetime.datetime.now()

        # Plot regions and points
        if show_adjacent_regions:
            x, y = zip(*nearPoints)
            fig, ax = plt.subplots(figsize=(10,10))
            ax.scatter(x, y, c=nearValues)

            regGS.plot(ax=ax, facecolor='#00000000', edgecolor='k')
            ax.scatter(x=newPoint.xy[0], y=newPoint.xy[1], c='k', marker='+')
            [ax.text(x=x[i], y=y[i], s=f"{nv:.0f}") for i, nv in enumerate(nearValues) if i!=0]
            plt.show()
            if time_segments:
                thisTime = datetime.datetime.now()
                print(f"Plot adjacent regions:  {thisTime-lastTime}")
                lastTime = datetime.datetime.now()

        # Create new voronoi only with points with regions nearby to new point (to reduce computational cost)
        nearbyVor = Voronoi(points=nearPoints)
        if time_segments:
            thisTime = datetime.datetime.now()
            print(f"Generate near voronoi:  {thisTime-lastTime}")
            lastTime = datetime.datetime.now()

        # Go through just the nearby regions, check validitiy, and get info
        # (Might be able to just access with indices and save time?)
        polygons = []
        for i, region in enumerate(nearbyVor.regions):
            # Create (valid) shapely polygon for each region using vertices
            vertList = [verts[v].tolist() for v in region]

            if len(vertList)>0 and vertList[0] != vertList[-1]:
                vertList.append(vertList[0])

            if len(vertList) < 4:
                continue
            
            currPoly = Polygon(vertList)

            # If polygon contains the point, record and break loop
            if currPoly.is_valid and currPoly.contains(newPoint) and containingRegion is False:
                containingRegion = currPoly
                containingRegionIndex = i
            polygons.append(currPoly)
            polyRegInd.append(i)

        if time_segments:
            thisTime = datetime.datetime.now()
            print(f"Get nearby polygons:  {thisTime-lastTime}")
            lastTime = datetime.datetime.now()

        # Get the "New" polygon (the one created for the interpolation point of interest)
        try:
            polyOfInterest = Polygon([nearbyVor.vertices[v] for v in nearbyVor.regions[nearbyVor.point_region[0]]])
        except:
            return np.nan
        
        if not polyOfInterest.is_valid:
            #print('Polygon of interest is not valid')
            return np.nan

        # Plot regions nearby point of interest, and overlap of new Voronoi and old
        if show_plot:
            fig3, ax3 = plt.subplots()
            regGS.plot(facecolor="#00000000", edgecolor='k', ax=ax3)
            x, y = zip(*nearPoints)
            ax3.scatter(x, y)
            gpd.GeoSeries([polyOfInterest], crs=output_crs).plot(facecolor="#00000000", 
                                                        edgecolor='r', linewidths=5, ax=ax3)
            if time_segments:
                thisTime = datetime.datetime.now()
                print(f"Plot nearby polys and overlap:  {thisTime-lastTime}")
                lastTime = datetime.datetime.now()
            plt.show()
            
        # Check which regions overlap and their areas
        intersectionAreas = []
        intersectionValues = []
        for row, reg in regGS.iterrows():
            if reg['geometry'].intersects(polyOfInterest):
                intersectionAreas.append(polyOfInterest.intersection(reg['geometry']).area)
                intersectionValues.append(float(wellValueList[vPR.index(reg['RegionIndices'])]))            

        # Calculate weights
        weights = np.array(intersectionAreas)/np.nansum(intersectionAreas)
        if time_segments:
            thisTime = datetime.datetime.now()
            print(f"Calculate weights:  {thisTime-lastTime}")
            lastTime = datetime.datetime.now()

        res = float(np.nansum(weights * intersectionValues))
        # Calculate and return natural neighbor interpolated value
        return res

    if parallelize:
        x = interp_grid["x"].values
        y = interp_grid["y"].values

        da_x = xr.DataArray(x, dims="x", coords={"x": x})
        da_y = xr.DataArray(y, dims="y", coords={"y": y})

        # Broadcast to full grid
        X, Y = xr.broadcast(da_x, da_y)

        # Apply function vectorized over coordinates
        result_vec = xr.apply_ufunc(
            _vectorized_nat_neighbor, X, Y,
            vectorize=True,
            #dask="parallelized", # optional if using dask
            output_dtypes=[float]        
            )

        print(interp_grid)
        print(result_vec)

        return xr.DataArray(result_vec.T,
                            coords={"y": interp_grid.y, "x": interp_grid.x},
                            dims=("y", "x"))
    else:
        print("Starting interpolation at all grid points: ", len(interp_grid['x'].values), 'x', len(interp_grid['y'].values), len(interp_grid['y'].values)*len(interp_grid['x'].values))
        resultList = []
        dataPoints = []
        for x in interp_grid['x'].values:
            innerList = []
            for y in interp_grid['y'].values:
                interpVal = _vectorized_nat_neighbor(x, y, time_segments=time_segments, verbose=verbose)
                innerList.append(interpVal)
                dataPoints.append((x, y))
            resultList.extend(innerList) #???????

        if verbose:
            print('\t\t Natural neighbor completed, reinterpolating to model grid')
            print("\t\tFinal Size of interpolation:", np.array(resultList).shape)

        interp = interpolate.NearestNDInterpolator(dataPoints, resultList)

        X, Y = np.meshgrid(original_coords['x'], original_coords['y'], sparse=True)  # 2D Grid for interpolation
        Z = interp(X, Y)

        return xr.DataArray(Z,
                            coords={"y": original_coords['y'], "x": original_coords['x']},
                            dims=("y", "x"))


# Optional, combine dataset

[docs]
def combine_dataset(layer_dataset, surface_elev, bedrock_elev, layer_thick, log=False):
    """Function to combine xarray datasets or datarrays into
    a single xarray.Dataset.
    Useful to add surface, bedrock, layer thick, and layer datasets
    all into one variable, for pickling or netcdf export, for example.

    Parameters
    ----------
    layer_dataset : xr.DataArray
        DataArray contining all the interpolated layer information.
    surface_elev : xr.DataArray
        DataArray containing surface elevation data
    bedrock_elev : xr.DataArray
        DataArray containing bedrock elevation data
    layer_thick : xr.DataArray
        DataArray containing layer thickness at each point in the model grid
    log : bool, default = False
        Whether to log inputs and outputs to log file.

    Returns
    -------
    xarray.Dataset or xarray.DataArray
        Dataset with all arrays set to different variables within dataset.
        Or simply DataArray, if specified.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)

    daDict = {}
    daDict['Layers'] = layer_dataset
    daDict['Surface_Elev'] = surface_elev
    daDict['Bedrock_Elev'] = bedrock_elev
    daDict['Layer_Thickness'] = layer_thick

    combined_dataset = xr.Dataset(daDict)

    return combined_dataset
Source code for w4h.layers

w4h

Navigation

Related Topics