"""The Clean module contains functions for cleaning the data (i.e., removing data not to be used in further analysis)
"""
import inspect
import numpy as np
import pandas as pd
from w4h import logger_function, verbose_print
# This function drops all records in downholeData with bad depth information (where the bottom of a record is nearer to the surface than the top)
def remove_bad_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', depth_type='depth', verbose=False, log=False):
    """Remove well intervals whose depth information is bad.

    An interval is considered bad when its bottom is nearer to the surface
    than its top, i.e., when the computed interval thickness is negative.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Pandas dataframe containing the well records and descriptions for each interval
    top_col : str, default='TOP'
        The name of the column containing the depth or elevation for the top of the interval, by default 'TOP'
    bottom_col : str, default='BOTTOM'
        The name of the column containing the depth or elevation for the bottom of each interval, by default 'BOTTOM'
    depth_type : str, {'depth', 'elevation'}
        Whether the table is organized by depth or elevation. If depth, the top column will have smaller
        values than the bottom column. If elevation, the top column will have higher values than the
        bottom column, by default 'depth'. Matching is case-insensitive and 'elev' is accepted as an
        alias for 'elevation'.
    verbose : bool, default = False
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe with the records removed where the top is indicated to be below the bottom.
        A 'THICKNESS' column is added and the index is reset. The input dataframe is not modified.

    Raises
    ------
    ValueError
        If `depth_type` is not one of 'depth', 'elevation', or 'elev'.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_bad_depth, locals(), exclude_params=['df_with_depth'])

    # Validate up front: previously an unrecognised value skipped the thickness
    # calculation and failed later (or silently reused a stale THICKNESS column).
    depth_kind = str(depth_type).lower()
    if depth_kind not in ('depth', 'elevation', 'elev'):
        raise ValueError(
            f"depth_type must be 'depth' or 'elevation', got {depth_type!r}"
        )

    # Work on a copy so the caller's dataframe does not gain a THICKNESS column
    df_with_depth = df_with_depth.copy()

    # Calculate interval thickness so that a valid interval is always >= 0
    if depth_kind == 'depth':
        df_with_depth['THICKNESS'] = df_with_depth[bottom_col] - df_with_depth[top_col]
    else:
        df_with_depth['THICKNESS'] = df_with_depth[top_col] - df_with_depth[bottom_col]

    before = df_with_depth.shape[0]  # Number of rows before dropping

    # Keep only rows with non-negative thickness; rows whose thickness is NaN
    # fail the comparison and are dropped as well.
    df_with_depth = df_with_depth[df_with_depth['THICKNESS'] >= 0]
    df_with_depth = df_with_depth.reset_index(drop=True)

    if verbose:
        after = df_with_depth.shape[0]
        print('\tRemoved well records with obviously bad depth information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} well records removed with bad depth information")
    return df_with_depth
# This function drops all records in downholeData with no depth information (either top or bottom depth of well interval)
def remove_no_depth(df_with_depth, top_col='TOP', bottom_col='BOTTOM', no_data_val_table='', verbose=False, log=False):
    """Remove well intervals that have no depth information.

    Parameters
    ----------
    df_with_depth : pandas.DataFrame
        Dataframe containing well descriptions
    top_col : str, optional
        Name of column containing information on the top of the well intervals, by default 'TOP'
    bottom_col : str, optional
        Name of column containing information on the bottom of the well intervals, by default 'BOTTOM'
    no_data_val_table : any, optional
        No data value in the input data, used by this function to indicate that depth data is not there,
        to be replaced by np.nan, by default ''
    verbose : bool, optional
        Whether to print results to console, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_depth : pandas.DataFrame
        New dataframe with the intervals dropped that are missing a top or a bottom value.
        The index is reset. The input dataframe is not modified.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_depth, locals(), exclude_params=['df_with_depth'])

    # Work on a copy so the column replacement below does not alter the caller's dataframe
    df_with_depth = df_with_depth.copy()

    # Replace the no-data marker in the top and bottom columns with nan
    for depth_col in (top_col, bottom_col):
        df_with_depth[depth_col] = df_with_depth[depth_col].replace(no_data_val_table, np.nan)

    # Number of rows before dropping
    before = df_with_depth.shape[0]

    # Drop records missing either the top or the bottom depth, then renumber the rows
    df_with_depth = df_with_depth.dropna(subset=[top_col, bottom_col])
    df_with_depth = df_with_depth.reset_index(drop=True)

    if verbose:
        after = df_with_depth.shape[0]
        print('\tRemoved well records with no depth information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} well records removed without depth information")
    return df_with_depth
# This function drops all records in downholeData with no formation information in the description field
def remove_no_description(df_with_descriptions, description_col='FORMATION', no_data_val_table='', verbose=False, log=False):
    """Remove all well records for which no geologic description is given.

    Parameters
    ----------
    df_with_descriptions : pandas.DataFrame
        Pandas dataframe containing the well records with their individual descriptions
    description_col : str, optional
        Name of the column containing the geologic description of each interval, by default 'FORMATION'
    no_data_val_table : str, optional
        The value expected if the column is empty or there is no data. These will be replaced by np.nan
        before being removed, by default ''
    verbose : bool, optional
        Whether to print the results of this step to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe with the records with no description removed. The index is reset.
        The input dataframe is not modified.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_description, locals(), exclude_params=['df_with_descriptions'])

    # Work on a copy so the column replacement below does not alter the caller's dataframe
    df_with_descriptions = df_with_descriptions.copy()

    # Replace the no-data marker in the description column with nan
    df_with_descriptions[description_col] = df_with_descriptions[description_col].replace(no_data_val_table, np.nan)

    before = df_with_descriptions.shape[0]  # Number of rows before dropping

    # Drop records without a description, then renumber the rows
    df_with_descriptions = df_with_descriptions.dropna(subset=[description_col])
    df_with_descriptions = df_with_descriptions.reset_index(drop=True)

    if verbose:
        after = df_with_descriptions.shape[0]
        print('\tRemoved well records without geologic descriptions. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} well records removed without geologic descriptions")
    return df_with_descriptions
# Function to remove data (intended for headerData) without surface topography information
# THIS ASSUMES AND SHOULD ONLY BE RUN AFTER ALL DESIRED SURFACE TOPO DATASETS HAVE BEEN MERGED/ADDED
def remove_no_topo(df_with_topo, zcol='SURFACE_ELEV', no_data_val_table='', verbose=False, log=False):
    """Remove wells that do not have topography data (needed for layer selection later).

    This function is intended to be run on the metadata table after the attempt
    to add elevations has been made.

    Parameters
    ----------
    df_with_topo : pandas.DataFrame
        Pandas dataframe containing elevation information.
    zcol : str, optional
        Name of elevation column, by default 'SURFACE_ELEV'
    no_data_val_table : any, optional
        Value in dataset that indicates no data is present (replaced with np.nan), by default ''
    verbose : bool, optional
        Whether to print outputs, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    pandas.DataFrame
        New dataframe with the records with no topography removed. The original index
        labels of the remaining rows are kept (the index is not reset). The input
        dataframe is not modified.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_no_topo, locals(), exclude_params=['df_with_topo'])

    before = df_with_topo.shape[0]  # Number of rows before dropping

    # Work on a copy so the column replacement below does not alter the caller's dataframe
    df_with_topo = df_with_topo.copy()

    # Replace the no-data marker with nan, then drop the rows lacking an elevation
    df_with_topo[zcol] = df_with_topo[zcol].replace(no_data_val_table, np.nan)
    df_with_topo = df_with_topo.dropna(subset=[zcol])

    if verbose:
        after = df_with_topo.shape[0]
        print('\tRemoved well records with no surface elevation information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} wells records removed without surface elevation information")
    return df_with_topo
# This function removes all data from the downholeData table where there is no location information (in the headerData table). This includes elevation info too
def remove_nonlocated(df_with_locations, xcol='LONGITUDE', ycol='LATITUDE', no_data_val_table='', verbose=False, log=False):
    """Remove wells and well intervals where there is no location information.

    Parameters
    ----------
    df_with_locations : pandas.DataFrame
        Pandas dataframe containing well descriptions, including the location columns
    xcol : str, optional
        Name of the column containing the x coordinate of each record, by default 'LONGITUDE'
    ycol : str, optional
        Name of the column containing the y coordinate of each record, by default 'LATITUDE'
    no_data_val_table : any, optional
        Value in dataset that indicates no data is present (replaced with np.nan), by default ''
    verbose : bool, optional
        Whether to print results to the terminal, by default False
    log : bool, default = False
        Whether to log results to log file, by default False

    Returns
    -------
    df_with_locations : pandas.DataFrame
        New dataframe containing only data with location information. The original index
        labels of the remaining rows are kept (the index is not reset). The input
        dataframe is not modified.
    """
    logger_function(log, locals(), inspect.currentframe().f_code.co_name)
    if verbose:
        verbose_print(remove_nonlocated, locals(), exclude_params=['df_with_locations'])

    before = df_with_locations.shape[0]  # Number of rows before dropping

    # Work on a copy so the column replacement below does not alter the caller's dataframe
    df_with_locations = df_with_locations.copy()

    # Replace the no-data marker in both coordinate columns with nan
    for coord_col in (xcol, ycol):
        df_with_locations[coord_col] = df_with_locations[coord_col].replace(no_data_val_table, np.nan)

    # Drop records missing either coordinate. A list is passed to `subset` so this
    # also works on pandas versions that do not accept a single bare label.
    df_with_locations = df_with_locations.dropna(subset=[xcol, ycol])

    if verbose:
        after = df_with_locations.shape[0]
        print('\tRemoved well records with no location information. ')
        print("\t\tNumber of records before removing: "+str(before))
        print("\t\tNumber of records after removing: "+str(after))
        print(f"\t{before-after} wells records removed without location information")
    return df_with_locations