# Source code for w4h.clean

"""The Clean module contains functions for cleaning the data (i.e., removing data not to be used in further analysis)
"""

import inspect

import numpy as np
import pandas as pd

from w4h import logger_function, verbose_print


# This function drops all records in downholeData with bad depth information (where the bottom of a record is nearer to the surface than the top)
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False):
    """Remove well intervals whose depth information is inconsistent.

    An interval is treated as bad when its bottom is nearer to the surface
    than its top, i.e. when its computed thickness is negative. Intervals
    whose thickness evaluates to NaN (for example because a top or bottom
    value is missing) are dropped as well, since they fail the ``>= 0``
    comparison. Zero-thickness intervals are kept.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing the well records and descriptions for each
        interval. It is not modified; a new dataframe is returned.
    top_col : str, default='TOP'
        Name of the column containing the depth or elevation of the top of
        each interval.
    bottom_col : str, default='BOTTOM'
        Name of the column containing the depth or elevation of the bottom
        of each interval.
    depth_type : str, {'depth', 'elevation', 'elev'}, default='depth'
        Whether the table is organized by depth or elevation (matched
        case-insensitively). If depth, thickness is ``bottom - top``. If
        elevation, thickness is ``top - bottom``.
    verbose : bool, default=False
        Whether to print results to the terminal.
    log : bool, default=False
        Whether to log results to log file.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with a ``THICKNESS`` column added (or overwritten),
        rows with negative or undefined thickness removed, and the index
        reset.

    Raises
    ------
    ValueError
        If ``depth_type`` is not one of the recognized values.
    """
    # locals() is captured before any new local is created so that only the
    # call arguments are passed to the logging/printing helpers.
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_bad_depth, locals(), exclude_params=['df_with_depth'])

    depth_kind = str(depth_type).lower()
    if depth_kind == 'depth':
        thickness = df_with_depth[bottom_col] - df_with_depth[top_col]
    elif depth_kind in ('elevation', 'elev'):
        thickness = df_with_depth[top_col] - df_with_depth[bottom_col]
    else:
        # Previously this fell through and failed later with a KeyError on
        # 'THICKNESS' (or silently filtered on a stale THICKNESS column).
        raise ValueError(
            f"depth_type must be 'depth' or 'elevation', got {depth_type!r}"
        )

    before = df_with_depth.shape[0]  # Number of rows before dropping

    # Work on a copy so the caller's dataframe does not gain a THICKNESS column.
    cleaned = df_with_depth.copy()
    cleaned['THICKNESS'] = thickness

    # Keep only rows with non-negative thickness (bottom not above top).
    cleaned = cleaned[cleaned['THICKNESS'] >= 0].reset_index(drop=True)

    if verbose:
        after = cleaned.shape[0]
        print('\tRemoved well records with obviously bad depth information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} well records removed with bad depth information")

    return cleaned
# This function drops all records in the downholedata with no depth information (either top or bottom depth of well interval)
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False):
    """Remove well intervals that have no depth information.

    Values equal to ``no_data_val_table`` in the top and bottom columns are
    replaced with ``np.nan``, then every row with a missing top or bottom is
    dropped.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing well descriptions. It is not modified; a new
        dataframe is returned.
    top_col : str, optional
        Name of column containing information on the top of the well
        intervals, by default 'TOP'
    bottom_col : str, optional
        Name of column containing information on the bottom of the well
        intervals, by default 'BOTTOM'
    no_data_val_table : any, optional
        No data value in the input data, used by this function to indicate
        that depth data is not there, to be replaced by np.nan, by default ''
    verbose : bool, optional
        Whether to print results to console, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_depth : pandas.DataFrame
        Copy of the input with rows lacking a top or bottom value removed and
        the index reset.
    """
    # locals() is captured before any new local is created so that only the
    # call arguments are passed to the logging/printing helpers.
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_depth, locals(), exclude_params=['df_with_depth'])

    # Work on a copy so the caller's dataframe is left untouched.
    cleaned = df_with_depth.copy()
    depth_cols = [top_col, bottom_col]

    # Replace the no-data marker in the top and bottom columns with NaN.
    for col in depth_cols:
        cleaned[col] = cleaned[col].replace(no_data_val_table, np.nan)

    before = cleaned.shape[0]  # Number of rows before dropping

    # Drop rows missing either depth value (dropna defaults to how='any').
    cleaned = cleaned.dropna(subset=depth_cols).reset_index(drop=True)

    if verbose:
        after = cleaned.shape[0]
        print('\tRemoved well records with no depth information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} well records removed without depth information")

    return cleaned
# This function drops all records in downholeData with no formation information in the description field
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False):
    """Remove well records for which no description is given.

    Values equal to ``no_data_val_table`` in the description column are
    replaced with ``np.nan``, then every row with a missing description is
    dropped.

    Parameters
    ----------
    df_with_descriptions : pandas.DataFrame
        Dataframe containing the well records with their individual
        descriptions. It is not modified; a new dataframe is returned.
    description_col : str, optional
        Name of the column containing the geologic description of each
        interval, by default 'FORMATION'
    no_data_val_table : str, optional
        The value expected if the column is empty or there is no data. These
        will be replaced by np.nan before being removed, by default ''
    verbose : bool, optional
        Whether to print the results of this step to the terminal, by default
        False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        Copy of the input with records without a description removed and the
        index reset.
    """
    # locals() is captured before any new local is created so that only the
    # call arguments are passed to the logging/printing helpers.
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_description, locals(), exclude_params=['df_with_descriptions'])

    # Work on a copy so the caller's dataframe is left untouched.
    cleaned = df_with_descriptions.copy()

    # Replace the no-data marker in the description column with NaN.
    cleaned[description_col] = cleaned[description_col].replace(no_data_val_table, np.nan)

    before = cleaned.shape[0]  # Number of rows before dropping

    # Drop records without a description and renumber the rows.
    cleaned = cleaned.dropna(subset=[description_col]).reset_index(drop=True)

    if verbose:
        after = cleaned.shape[0]
        print('\tRemoved well records without geologic descriptions. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} well records removed without geologic descriptions")

    return cleaned
# Function to remove data (intended for headerData) without surface topography information # THIS ASSUMES AND SHOULD ONLY BE RUN AFTER ALL DESIRED SURFACE TOPO DATASETS HAVE BEEN MERGED/ADDED
def remove_no_topo(df_with_topo, zcol='SURFACE_ELEV', no_data_val_table='', verbose=False, log=False):
    """Remove wells that do not have surface elevation (topography) data.

    Intended to be run on the metadata table after all desired surface
    elevation datasets have been merged/added. Values equal to
    ``no_data_val_table`` in the elevation column are replaced with
    ``np.nan``, then every row with a missing elevation is dropped. Unlike
    the other cleaning functions in this module, the index is not reset.

    Parameters
    ----------
    df_with_topo : pandas.DataFrame
        Dataframe containing elevation information. It is not modified; a
        new dataframe is returned.
    zcol : str, optional
        Name of elevation column, by default 'SURFACE_ELEV'
    no_data_val_table : any, optional
        Value in dataset that indicates no data is present (replaced with
        np.nan), by default ''
    verbose : bool, optional
        Whether to print outputs, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        Copy of the input with rows lacking surface elevation removed. The
        original index labels are preserved.
    """
    # locals() is captured before any new local is created so that only the
    # call arguments are passed to the logging/printing helpers.
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_topo, locals(), exclude_params=['df_with_topo'])

    before = df_with_topo.shape[0]  # Number of rows before dropping

    # Work on a copy so the caller's dataframe is left untouched.
    cleaned = df_with_topo.copy()
    cleaned[zcol] = cleaned[zcol].replace(no_data_val_table, np.nan)
    cleaned = cleaned.dropna(subset=[zcol])

    if verbose:
        after = cleaned.shape[0]
        print('\tRemoved well records with no surface elevation information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} wells records removed without surface elevation information")

    return cleaned
# This function removes all data from the downholeData table where there is no location information (in the headerData table). This includes elevation info too
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False):
    """Remove wells and well intervals that have no location information.

    Values equal to ``no_data_val_table`` in the x and y coordinate columns
    are replaced with ``np.nan``, then every row missing either coordinate is
    dropped. The index is not reset.

    Parameters
    ----------
    df_with_locations : pandas.DataFrame
        Dataframe containing well records together with their coordinate
        columns. It is not modified; a new dataframe is returned.
    xcol : str, optional
        Name of the column containing the x coordinate, by default
        'LONGITUDE'
    ycol : str, optional
        Name of the column containing the y coordinate, by default
        'LATITUDE'
    no_data_val_table : any, optional
        Value in dataset that indicates no data is present (replaced with
        np.nan), by default ''
    verbose : bool, optional
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_locations : pandas.DataFrame
        Copy of the input containing only rows with both coordinates
        present. The original index labels are preserved.
    """
    # locals() is captured before any new local is created so that only the
    # call arguments are passed to the logging/printing helpers.
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_nonlocated, locals(), exclude_params=['df_with_locations'])

    before = df_with_locations.shape[0]  # Number of rows before dropping

    # Work on a copy so the caller's dataframe is left untouched.
    cleaned = df_with_locations.copy()
    coord_cols = [xcol, ycol]
    for col in coord_cols:
        cleaned[col] = cleaned[col].replace(no_data_val_table, np.nan)

    # A list is passed to `subset` because a bare label is only accepted by
    # pandas >= 1.5; dropna's default how='any' drops rows missing either
    # coordinate, matching the previous two sequential calls.
    cleaned = cleaned.dropna(subset=coord_cols)

    if verbose:
        after = cleaned.shape[0]
        print('\tRemoved well records with no location information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print("\t{} wells records removed without location information".format(before-after))

    return cleaned