# Source code for bulwark.checks

# -*- coding: utf-8 -*-
"""
Each function in this module should:

- take a pd.DataFrame as its first argument, with optional additional arguments,
- make an assert about the pd.DataFrame, and
- return the original, unaltered pd.DataFrame

"""
import operator
import warnings

import numpy as np
import pandas as pd
import pandas.testing as tm

from bulwark.generic import bad_locations

# DeprecationWarning is ignored by default in Python; force it to always be
# shown so users of the deprecated alias functions below (e.g. none_missing,
# unique_index) actually see the rename notices.
warnings.simplefilter('always', DeprecationWarning)


def has_columns(df, columns, exact_cols=False, exact_order=False):
    """Asserts that `df` has ``columns``

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list or tuple): Columns that are expected to be in ``df``.
        exact_cols (bool): Whether or not ``columns`` need to be the only columns in ``df``.
        exact_order (bool): Whether or not ``columns`` need to be in the same order
            as the columns in ``df``.

    Returns:
        Original `df`.

    """
    problems = []

    missing_cols = list(set(columns).difference(df.columns))
    if missing_cols:
        problems.append("`df` is missing columns: {}.".format(missing_cols))

    if exact_cols:
        extra_cols = list(set(df.columns).difference(columns))
        if extra_cols:
            problems.append("`df` has extra columns: {}.".format(extra_cols))

    if exact_order:
        if missing_cols:
            # Order can't be checked meaningfully while columns are absent.
            problems.append("`df` column order does not match given `columns` order, "
                            "because columns are missing.")
        else:
            # Map each df column position to its position in `columns`; the
            # mapping must already be sorted for the orders to agree.
            positions = []
            for i in range(len(columns)):
                try:
                    positions.append(columns.index(df.columns[i]))
                except ValueError:
                    pass
            if positions != sorted(positions):
                problems.append("`df` column order does not match given `columns` order.")

    if problems:
        raise AssertionError(" ".join(problems))

    return df
def has_no_x(df, values=None, columns=None):
    """Asserts that there are no user-specified `values` in `df`'s `columns`.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        values (list): A list of values to check for in the pd.DataFrame.
        columns (list): A subset of columns to check for `values`.

    Returns:
        Original `df`.

    """
    if values is None:
        values = []
    if columns is None:
        columns = df.columns

    hits = df[columns].isin(values)
    try:
        assert not hits.values.any()
    except AssertionError as e:
        # Attach the offending (row, column) locations to the error.
        e.args = bad_locations(hits)
        raise
    return df
def none_missing(df, columns=None):
    """Deprecated: Replaced with has_no_nans"""
    msg = ("This function has been renamed to has_no_nans. "
           "The old name will be removed in 0.7.")
    warnings.warn(msg, DeprecationWarning, stacklevel=1)
    return has_no_nans(df, columns)
def has_no_nans(df, columns=None):
    """Asserts that there are no np.nans in `df`.

    Convenience wrapper around `has_no_x` with ``values=[np.nan]``.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for np.nans.

    Returns:
        Original `df`.

    """
    nan_values = [np.nan]
    return has_no_x(df, values=nan_values, columns=columns)
def has_no_nones(df, columns=None):
    """Asserts that there are no Nones in `df`.

    Convenience wrapper around `has_no_x` with ``values=[None]``.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for Nones.

    Returns:
        Original `df`.

    """
    none_values = [None]
    return has_no_x(df, values=none_values, columns=columns)
def has_no_infs(df, columns=None):
    """Asserts that there are no np.infs in `df`.

    Convenience wrapper around `has_no_x` with ``values=[np.inf]``.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for np.infs.

    Returns:
        Original `df`.

    """
    inf_values = [np.inf]
    return has_no_x(df, values=inf_values, columns=columns)
def has_no_neg_infs(df, columns=None):
    """Asserts that there are no -np.infs in `df`.

    Convenience wrapper around `has_no_x` with ``values=[-np.inf]``.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for -np.infs.

    Returns:
        Original `df`.

    """
    neg_inf_values = [-np.inf]
    return has_no_x(df, values=neg_inf_values, columns=columns)
def has_set_within_vals(df, items):
    """Asserts that all given values are found in columns' values.

    The values given for each column in `items` must all be a subset of the
    values actually present in that column of `df`.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Mapping of columns to values excepted to be found within them.

    Returns:
        Original `df`.

    Examples:
        >>> import bulwark.checks as ck
        >>> import pandas as pd
        >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
        >>> ck.has_set_within_vals(df, items={"a": [1, 2]})
           a  b
        0  1  a
        1  2  b
        2  3  c

    """
    failures = {}
    for col, expected_vals in items.items():
        absent = np.setdiff1d(expected_vals, df[col].unique(), assume_unique=True).tolist()
        if absent:
            failures[col] = absent

    if failures:
        raise AssertionError("The following column: value pairs are missing: {}"
                             .format(failures))
    return df
def unique_index(df):
    """Deprecated: Replaced with has_unique_index"""
    # BUG FIX: the warning previously advertised "hasunique_index" (missing
    # underscore); point users at the real replacement, has_unique_index.
    warnings.warn("This function has been renamed to has_unique_index. "
                  "The old name will be removed in 0.7.",
                  DeprecationWarning, stacklevel=1)
    return has_unique_index(df)
def has_unique_index(df):
    """Asserts that `df`'s index is unique.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.

    Returns:
        Original `df`.

    """
    if not df.index.is_unique:
        # Report each duplicated index label once as the error's args.
        duplicated_labels = df.index[df.index.duplicated()].unique()
        raise AssertionError(*duplicated_labels)
    return df
def is_monotonic(df, items=None, increasing=None, strict=False):
    """Asserts that the `df` is monotonic.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Mapping of columns to an (increasing, strict) tuple,
            e.g. {'col_a': (True, False), 'col_b': (None, False)}.
            When None, every column is checked with the given
            `increasing`/`strict` settings.
        increasing (bool, None): None checks for either increasing or
            decreasing monotonicity.
        strict (bool): Whether the comparison should be strict,
            meaning two equal consecutive values should fail.

    Returns:
        Original `df`.

    Examples:
        >>> import bulwark.checks as ck
        >>> import pandas as pd
        >>> df = pd.DataFrame({"incr_strict": [1, 2, 3, 4],
        ...                    "decr_strict": [4, 3, 2, 1]})
        >>> items = {"incr_strict": (True, True), "decr_strict": (False, True)}
        >>> _ = ck.is_monotonic(df, items=items)

        A failing check raises with the offending (row, column) locations:

        >>> df2 = pd.DataFrame({'not_monotonic': [1, 2, 3, 2]})
        >>> ck.is_monotonic(df2, increasing=True, strict=False)
        Traceback (most recent call last):
        ...
        AssertionError: [(3, 'not_monotonic')]

    """
    if items is None:
        items = {col: (increasing, strict) for col in df}

    # (increasing, strict) -> comparison(s) applied to consecutive diffs.
    # The `None` keys carry a (gt/ge, lt/le) pair so either direction can pass.
    comparators = {
        (True, True): operator.gt,
        (False, True): operator.lt,
        (True, False): operator.ge,
        (False, False): operator.le,
        (None, True): (operator.gt, operator.lt),
        (None, False): (operator.ge, operator.le),
    }

    failures = pd.DataFrame()
    for col, (incr, is_strict) in items.items():
        diffs = df[col].diff().dropna()
        compare = comparators[(incr, is_strict)]
        if incr is None:
            going_up = compare[0](diffs, 0)
            going_down = compare[1](diffs, 0)
            # Column fails only if it is neither all-increasing nor
            # all-decreasing (note: `not` applies to the whole `|` expression).
            if not going_up.all() | going_down.all():
                failures[diffs.name] = ~going_up | ~going_down
        else:
            failures[diffs.name] = ~compare(diffs, 0)

    if np.any(failures):
        raise AssertionError(bad_locations(failures))

    return df
def is_shape(df, shape):
    """Asserts that `df` is of a known row x column `shape`.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        shape (tuple): Shape of `df` as (n_rows, n_columns).
            Use None or -1 as a wildcard for a dimension you don't care about.

    Returns:
        Original `df`.

    """
    dims_match = np.equal(df.shape, shape)
    # A dimension given as -1 or None is treated as "anything goes".
    wildcards = np.equal(shape, [-1, -1]) | np.equal(shape, [None, None])
    try:
        assert np.all(dims_match | wildcards)
    except AssertionError as e:
        e.args = ("Expected shape: {}\n"
                  "\t\tActual shape: {}".format(shape, df.shape),)
        raise
    return df
def unique(df, columns=None):
    """Asserts that columns in `df` only have unique values.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        columns (list): A subset of columns to check for uniqueness of row values.

    Returns:
        Original `df`.

    """
    cols_to_check = df.columns if columns is None else columns
    # Raise on the first column containing duplicate values, if any.
    dupe_col = next((col for col in cols_to_check if not df[col].is_unique), None)
    if dupe_col is not None:
        raise AssertionError("Column {!r} contains non-unique values".format(dupe_col))
    return df
def within_set(df, items=None):
    """Deprecated: replaced with has_vals_within_set"""
    msg = ("This function has been renamed to has_vals_within_set. "
           "The old name will be removed in 0.7.")
    warnings.warn(msg, DeprecationWarning, stacklevel=1)
    return has_vals_within_set(df, items)
def has_vals_within_set(df, items=None):
    """Asserts that `df` is a subset of items.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Mapping of columns (col) to array-like of values (v) that
            ``df[col]`` is expected to be a subset of.

    Returns:
        Original `df`.

    """
    for col, allowed in items.items():
        membership = df[col].isin(allowed)
        if not membership.all():
            raise AssertionError('Not in set', df.loc[~membership, col])
    return df
def within_range(df, items=None):
    """Deprecated: Replaced with has_vals_within_range"""
    msg = ("This function has been renamed to has_vals_within_range. "
           "The old name will be removed in 0.7.")
    warnings.warn(msg, DeprecationWarning, stacklevel=1)
    return has_vals_within_range(df, items)
def has_vals_within_range(df, items=None):
    """Asserts that `df` is within a range.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Mapping of columns (col) to a (low, high) tuple (v)
            that ``df[col]`` is expected to be between (inclusive).

    Returns:
        Original `df`.

    Examples:
        >>> import bulwark.checks as ck
        >>> import pandas as pd
        >>> df = pd.DataFrame({'a': [1, 2, 3], 'b': ['a', 'b', 'c']})
        >>> ck.has_vals_within_range(df, items={'a': (0, 3)})
           a  b
        0  1  a
        1  2  b
        2  3  c

    """
    for col, (lower, upper) in items.items():
        below = df[col] < lower
        above = df[col] > upper
        if below.any() or above.any():
            raise AssertionError("Outside range", below | above)
    return df
def within_n_std(df, n=3):
    """Deprecated: replaced with has_vals_within_n_std"""
    msg = ("This function has been renamed to has_vals_within_n_std. "
           "The old name will be removed in 0.7.")
    warnings.warn(msg, DeprecationWarning, stacklevel=1)
    return has_vals_within_n_std(df, n)
def has_vals_within_n_std(df, n=3):
    """Asserts that every value is within ``n`` standard deviations
    of its column's mean.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        n (int): Number of standard deviations from the mean.

    Returns:
        Original `df`.

    """
    col_means = df.mean()
    col_stds = df.std()
    # Only columns with a computable mean participate in the check.
    within = (np.abs(df[col_means.index] - col_means) < n * col_stds)
    if not np.all(within):
        raise AssertionError(bad_locations(~within))
    return df
def has_dtypes(df, items):
    """Asserts that `df` has ``dtypes``

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        items (dict): Mapping of columns to dtype.

    Returns:
        Original `df`.

    """
    actual_dtypes = df.dtypes
    for col, expected_dtype in items.items():
        if actual_dtypes[col] != expected_dtype:
            raise AssertionError("{} has the wrong dtype. Should be ({}), is ({})"
                                 .format(col, expected_dtype, actual_dtypes[col]))
    return df
def one_to_many(df, unitcol, manycol):
    """Asserts that a many-to-one relationship is preserved between two columns.

    For example, a retail store will have distinct departments, each with
    several employees. If each employee may only work in a single department,
    then the relationship of the department to the employees is one to many.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        unitcol (str): The column that encapulates the groups in ``manycol``.
        manycol (str): The column that must remain unique in the distinct
            pairs between ``manycol`` and ``unitcol``.

    Returns:
        Original `df`.

    """
    distinct_pairs = df[[manycol, unitcol]].drop_duplicates()
    for value in distinct_pairs[manycol].unique():
        # A manycol value appearing in more than one distinct pair is mapped
        # to multiple unitcol values, which violates the relationship.
        if distinct_pairs[distinct_pairs[manycol] == value].shape[0] > 1:
            raise AssertionError("{} in {} has multiple values for {}"
                                 .format(value, manycol, unitcol))
    return df
def is_same_as(df, df_to_compare, **kwargs):
    """Asserts that two pd.DataFrames are equal.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        df_to_compare (pd.DataFrame): A second pd.DataFrame.
        **kwargs (dict): Keyword arguments passed through to pandas'
            ``assert_frame_equal``.

    Returns:
        Original `df`.

    """
    try:
        tm.assert_frame_equal(df, df_to_compare, **kwargs)
    except AssertionError as err:
        # Re-raise with a short message; the chained cause keeps the detail.
        raise AssertionError("DataFrames are not equal") from err
    return df
def multi_check(df, checks, warn=False):
    """Asserts that all checks pass.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        checks (dict): Mapping of check functions to parameters
            for those check functions.
        warn (bool): Indicates whether an error should be raised
            or only a warning notification should be displayed.
            Default is to error.

    Returns:
        Original `df`.

    """
    failures = []
    for check_func, check_params in checks.items():
        try:
            check_func(df, **check_params)
        except AssertionError as err:
            failures.append(err)

    if failures:
        if warn:
            # Best-effort mode: surface the failures without raising.
            print(failures)
        else:
            raise AssertionError("\n".join(str(f) for f in failures))

    return df
def custom_check(df, check_func, *args, **kwargs):
    """Assert that `check(df, *args, **kwargs)` is true.

    Args:
        df (pd.DataFrame): Any pd.DataFrame.
        check_func (function): A function taking `df`, `*args`, and `**kwargs`.
            Should raise AssertionError if check not passed.

    Returns:
        Original `df`.

    """
    try:
        check_func(df, *args, **kwargs)
    except AssertionError as err:
        # Replace the error's args with a message naming the failed check.
        err.args = ("{} is not true.".format(check_func.__name__),)
        raise
    return df