Check

Source code in cuallee/__init__.py

class Check:
    def __init__(
        self,
        level: Union[CheckLevel, int] = 0,
        name: str = "cuallee.check",
        *,
        execution_date: Union[datetime, None] = None,
        table_name: str = None,
        session: Any = None,
    ):
        """
        A container of data quality rules.

        Args:
            level (CheckLevel): [0-1] value to describe if its a WARNING or ERROR check
            name (str): Normally the name of the dataset being verified, or a name for this check
            execution_date (datetime): Timestamp of the check. When omitted, the current UTC
                time is taken at the moment the Check is created.
            table_name (str): When using databases matches the table name of the source
            session (Session): When operating in Session enabled environments like Databricks or Snowflake

        """
        # Registry of rules, keyed by each rule's key
        self._rule: Dict[str, Rule] = {}
        # Declared only; no value is assigned in the constructor
        self.compute_engine: ModuleType

        if isinstance(level, int):
            # When the user is lazy and wants to do WARN=0, or ERR=1
            level = CheckLevel(level)

        # A default written in the signature would be evaluated only once, when the
        # function is defined, and every Check would share that stale timestamp.
        if execution_date is None:
            execution_date = datetime.now(timezone.utc)

        self.level = level
        self.name = name
        self.date = execution_date
        self.rows = -1
        self.config: Dict[str, str] = {}
        self.table_name = table_name
        self.dtype = "cuallee.dataframe"
        try:
            # Optional rule families; a failed import is logged and the
            # corresponding attributes are simply not set.
            from .iso.checks import ISO
            from .bio.checks import BioChecks

            self.iso = ISO(self)
            self.bio = BioChecks(self)
        except (ModuleNotFoundError, ImportError) as err:
            logger.error(f"Dependency modules missing: {str(err)}")
        self.session = session

    def __repr__(self):
        standard = (
            f"Check(level:{self.level}, description:{self.name}, rules:{self.sum})"
        )
        if self.table_name:
            standard += f" / table:{self.table_name}"
        return standard

    @property
    def sum(self):
        """Total number of rules in Check"""
        return len(self._rule.keys())

    @property
    def rules(self):
        """Returns all rules defined for check"""
        return list(self._rule.values())

    @property
    def keys(self):
        """Returns blake2s unique identifiers of rules"""
        return list(self._rule.keys())

    @property
    def empty(self):
        """True when no rules are added in the check"""
        return len(self.rules) == 0

    def _remove_rule_generic(self, key: str):
        """
        Remove a key from rules and compute dictionaries

        Args:
            key (str): the blake2s key of the rule
        """
        if key in self._rule:
            self._rule.pop(key)

    def add_rule(self, method: str, *arg, **kwargs):
        """
        Invoke one of this object's methods by name, forwarding the given arguments.

        Args:
            method (str): Name of the check method to call
            arg (list): Positional parameters of the Rule
            kwargs (dict): Keyword options for the Rule

        Returns:
            Whatever the named method returns.
        """
        bound = getattr(self, method)
        return bound(*arg, **kwargs)

    def delete_rule_by_key(self, keys: Union[str, List[str]]):
        """
        Delete rules from check based on keys.

        Args:
            keys (List[str]): a single or list of keys to remove from the check
        """
        if isinstance(keys, str):
            keys = [keys]

        [self._remove_rule_generic(key) for key in keys]
        return self

    def delete_rule_by_attribute(
        self,
        rule_attribute: Literal["method", "column", "coverage"],
        values: Union[List[str], List[float]],
    ):
        """
        Remove every rule whose chosen attribute equals one of the given values.

        Args:
            rule_attribute (str): Attribute used to find rules: method, column or coverage
            values (List[str]): Rules whose `rule_attribute` is in this list are deleted;
                anything that is not a list is wrapped into a one-element list

        Returns:
            The check itself, so calls can be chained.
        """
        candidates = values if isinstance(values, List) else [values]

        def _matches(rule):
            return operator.attrgetter(rule_attribute)(rule) in candidates

        # valfilter builds a separate dict, so removing from self._rule while looping is safe
        for rule_key in valfilter(_matches, self._rule):
            self._remove_rule_generic(rule_key)
        return self

    def adjust_rule_coverage(self, rule_index: int, rule_coverage: float):
        """
        Adjust the ratio predicate/rows for a rule.
        It is intended to lower or increase tolerance without having to rewrite the entire check

        Args:
            rule_index (int): The position of the rule in the check list
            rule_coverage (float): New value between [0..1] for tolerance

        """
        target_rule = self.rules[rule_index]
        old_key = target_rule.key
        target_rule = self._rule.pop(old_key)
        target_rule.coverage = rule_coverage
        target_rule >> self._rule
        return self

    def is_complete(self, column: str, pct: float = 1.0):
        """
        Register a rule validating non-null values in a column

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_complete", column, "N/A", CheckDataType.AGNOSTIC, pct)
        rule >> self._rule
        return self

    def is_empty(self, column: str, pct: float = 1.0):
        """
        Register a rule validating null values in a column

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_empty", column, "N/A", CheckDataType.AGNOSTIC, pct)
        rule >> self._rule
        return self

    def are_complete(self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0):
        """
        Register a rule validating non-null values across a group of columns

        Args:
            column (List[str]): A tuple or list of column names in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("are_complete", column, "N/A", CheckDataType.AGNOSTIC, pct)
        rule >> self._rule
        return self

    def is_unique(
        self,
        column: str,
        pct: float = 1.0,
        approximate: bool = False,
        ignore_nulls: bool = False,
    ):
        """
        Register a rule validating unique values in a column

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass
            approximate (bool): A flag to speed up computation using an approximation through maximum relative std. dev.
            ignore_nulls (bool): Run drop nulls before counting

        Returns:
            The check itself, so calls can be chained.
        """
        settings = {"approximate": approximate, "ignore_nulls": ignore_nulls}
        rule = Rule(
            "is_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options=settings
        )
        rule >> self._rule
        return self

    def is_primary_key(self, column: str, pct: float = 1.0):
        """
        Register an `is_unique` rule on a column, labelled `is_primary_key` through its options

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        alias = {"name": "is_primary_key"}
        rule = Rule(
            "is_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options=alias
        )
        rule >> self._rule
        return self

    def are_unique(self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0):
        """
        Register a rule validating unique values across a group of columns

        Args:
            column (List[str]): A tuple or list of column names in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("are_unique", column, "N/A", CheckDataType.AGNOSTIC, pct)
        rule >> self._rule
        return self

    def is_composite_key(
        self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0
    ):
        """
        Register an `are_unique` rule on a group of columns, labelled `is_composite_key` through its options

        Args:
            column (List[str]): A tuple or list of column names in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        alias = {"name": "is_composite_key"}
        rule = Rule(
            "are_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options=alias
        )
        rule >> self._rule
        return self

    def is_greater_than(self, column: str, value: float, pct: float = 1.0):
        """
        Register a rule validating that a numeric column is greater than a value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_greater_than", column, value, CheckDataType.NUMERIC, pct)
        rule >> self._rule
        return self

    def is_positive(self, column: str, pct: float = 1.0):
        """
        Shortcut for `is_greater_than` with zero as the value

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_greater_than`.
        """
        return self.is_greater_than(column, value=0, pct=pct)

    def is_greater_or_equal_than(self, column: str, value: float, pct: float = 1.0):
        """
        Register a rule validating that a numeric column is greater than or equal to a value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule(
            "is_greater_or_equal_than", column, value, CheckDataType.NUMERIC, pct
        )
        rule >> self._rule
        return self

    def is_in_millions(self, column: str, pct: float = 1.0):
        """
        Shortcut for `is_greater_or_equal_than` with 1M (1e6) as the value

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_greater_or_equal_than`.
        """
        return self.is_greater_or_equal_than(column, value=1e6, pct=pct)

    def is_in_billions(self, column: str, pct: float = 1.0):
        """
        Shortcut for `is_greater_or_equal_than` with 1B (1e9) as the value

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_greater_or_equal_than`.
        """
        return self.is_greater_or_equal_than(column, value=1e9, pct=pct)

    def is_less_than(self, column: str, value: float, pct: float = 1.0):
        """
        Register a rule validating that a numeric column is less than a value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_less_than", column, value, CheckDataType.NUMERIC, pct)
        rule >> self._rule
        return self

    def is_negative(self, column: str, pct: float = 1.0):
        """
        Shortcut for `is_less_than` with zero as the value

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_less_than`.
        """
        return self.is_less_than(column, value=0, pct=pct)

    def is_less_or_equal_than(self, column: str, value: float, pct: float = 1.0):
        """
        Register a rule validating that a numeric column is less than or equal to a value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_less_or_equal_than", column, value, CheckDataType.NUMERIC, pct)
        rule >> self._rule
        return self

    def is_equal_than(self, column: str, value: float, pct: float = 1.0):
        """
        Register a rule validating that a numeric column equals a value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_equal_than", column, value, CheckDataType.NUMERIC, pct)
        rule >> self._rule
        return self

    def has_pattern(
        self,
        column: str,
        value: str,
        pct: float = 1.0,
        options: Union[Dict[str, str], None] = None,
    ):
        """
        Validation for string type column matching regex expression

        Args:
            column (str): Column name in dataframe
            value (regex): A regular expression used to match values in the `column`
            pct (float): The threshold percentage required to pass
            options (Dict[str, str]): Extra options forwarded to the Rule; an empty
                dictionary is used when omitted

        Returns:
            The check itself, so calls can be chained.
        """
        # A `{}` default in the signature would be one shared object handed to every
        # rule created without explicit options; build a fresh dict per call instead.
        if options is None:
            options = {}
        rule = Rule(
            "has_pattern", column, value, CheckDataType.STRING, pct, options=options
        )
        rule >> self._rule
        return self

    def is_legit(self, column: str, pct: float = 1.0):
        """
        Validation for string columns giving a wrong signal about completeness due to empty strings.

        Useful when reading CSV files, to prevent empty strings being reported as valid records.
        This is an `alias` of the `has_pattern` rule using the pattern `^\\S+$`, i.e. only
        non-whitespace characters between the beginning and end of the string.

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        alias = {"name": "is_legit"}
        rule = Rule(
            "has_pattern", column, r"^\S+$", CheckDataType.STRING, pct, options=alias
        )
        rule >> self._rule
        return self

    def has_min(self, column: str, value: float):
        """
        Register a rule validating a column's minimum value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("has_min", column, value, CheckDataType.NUMERIC)
        rule >> self._rule
        return self

    def has_max(self, column: str, value: float):
        """
        Register a rule validating a column's maximum value

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("has_max", column, value, CheckDataType.NUMERIC)
        rule >> self._rule
        return self

    def has_std(self, column: str, value: float):
        """
        Register a rule validating a column's standard deviation

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("has_std", column, value, CheckDataType.NUMERIC)
        rule >> self._rule
        return self

    def has_mean(self, column: str, value: float):
        """
        Register a rule validating a column's average/mean

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("has_mean", column, value, CheckDataType.NUMERIC)
        rule >> self._rule
        return self

    def has_sum(self, column: str, value: float):
        """
        Register a rule validating the sum of all values of a column

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("has_sum", column, value, CheckDataType.NUMERIC)
        rule >> self._rule
        return self

    def is_between(self, column: str, value: Tuple[Any], pct: float = 1.0):
        """
        Register a rule validating that a column lies within a range

        Args:
            column (str): Column name in dataframe
            value (List[str,number,date]): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_between", column, value, CheckDataType.AGNOSTIC, pct)
        rule >> self._rule
        return self

    def not_contained_in(
        self,
        column: str,
        value: Union[List, Tuple],
        pct: float = 1.0,
    ):
        """
        Register a rule validating that column values are not in a set of given values

        Args:
            column (str): Column name in dataframe
            value (List[str,number,date]): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("not_contained_in", column, value, CheckDataType.AGNOSTIC, pct)
        rule >> self._rule
        return self

    def not_in(self, column: str, value: Tuple[str, int, float], pct: float = 1.0):
        """
        Shortcut that forwards its arguments to `not_contained_in`

        Args:
            column (str): Column name in dataframe
            value (List[str,number,date]): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `not_contained_in`.
        """
        return self.not_contained_in(column=column, value=value, pct=pct)

    def is_contained_in(
        self,
        column: str,
        value: Union[List, Tuple],
        pct: float = 1.0,
        options: Union[Dict[str, str], None] = None,
    ):
        """
        Validation of column value in set of given values

        Args:
            column (str): Column name in dataframe
            value (List[str,number,date]): The condition for the column to match
            pct (float): The threshold percentage required to pass
            options (Dict[str, str]): Extra options forwarded to the Rule; an empty
                dictionary is used when omitted

        Returns:
            The check itself, so calls can be chained.
        """
        # A `{}` default in the signature would be one shared object handed to every
        # rule created without explicit options; build a fresh dict per call instead.
        if options is None:
            options = {}
        rule = Rule(
            "is_contained_in",
            column,
            value,
            CheckDataType.AGNOSTIC,
            pct,
            options=options,
        )
        rule >> self._rule
        return self

    def is_in(self, column: str, value: Tuple[str, int, float], pct: float = 1.0):
        """
        Shortcut for `is_contained_in`, labelled `is_in` through its options

        Args:
            column (str): Column name in dataframe
            value (List[str,number,date]): The condition for the column to match
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_contained_in`.
        """
        alias = {"name": "is_in"}
        return self.is_contained_in(column, value, pct, options=alias)

    def is_t_minus_n(
        self,
        column: str,
        value: int,
        pct: float = 1.0,
        options: Union[Dict[str, str], None] = None,
    ):
        """
        Validate that date is `n` days before the current date

        The target day is the current UTC time minus `value` days, formatted as
        `YYYY-MM-DD`, and the validation is delegated to `is_contained_in` with that
        single date as the allowed set.

        Args:
            column (str): Column name in dataframe
            value (int): The number of days before the current date
            pct (float): The threshold percentage required to pass
            options (Dict[str, str]): Extra options forwarded to the Rule; defaults to
                `{"name": "is_t_minus_n"}` when omitted

        Returns:
            The result of `is_contained_in`.
        """
        # Build the default per call so no dictionary object is shared across rules
        if options is None:
            options = {"name": "is_t_minus_n"}
        # Timezone-aware replacement for the deprecated datetime.utcnow()
        target_day = datetime.now(timezone.utc) - timedelta(days=value)
        return self.is_contained_in(
            column, (target_day.strftime("%Y-%m-%d"),), pct, options=options
        )

    def is_t_minus_1(self, column: str, pct: float = 1.0):
        """
        Validate that date is yesterday, via `is_t_minus_n` with 1 day

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_t_minus_n`.
        """
        alias = {"name": "is_t_minus_1"}
        return self.is_t_minus_n(column, 1, pct, options=alias)

    def is_t_minus_2(self, column: str, pct: float = 1.0):
        """
        Validate that date is 2 days ago, via `is_t_minus_n` with 2 days

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_t_minus_n`.
        """
        alias = {"name": "is_t_minus_2"}
        return self.is_t_minus_n(column, 2, pct, options=alias)

    def is_t_minus_3(self, column: str, pct: float = 1.0):
        """
        Validate that date is 3 days ago, via `is_t_minus_n` with 3 days

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_t_minus_n`.
        """
        alias = {"name": "is_t_minus_3"}
        return self.is_t_minus_n(column, 3, pct, options=alias)

    def is_yesterday(self, column: str, pct: float = 1.0):
        """
        Validate that date is yesterday, via `is_t_minus_n` with 1 day

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_t_minus_n`.
        """
        alias = {"name": "is_yesterday"}
        return self.is_t_minus_n(column, 1, pct, options=alias)

    def is_today(self, column: str, pct: float = 1.0):
        """
        Validate that date is today, via `is_t_minus_n` with 0 days

        Args:
            column (str): Column name in dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The result of `is_t_minus_n`.
        """
        alias = {"name": "is_today"}
        return self.is_t_minus_n(column, 0, pct, options=alias)

    def has_percentile(
        self, column: str, value: float, percentile: float, precision: int = 10000
    ):
        """
        Register a rule validating a column percentile value using approximation

        Args:
            column (str): Column name in dataframe
            value (number): The condition for the column to match
            percentile (float): Value between [0,1] i.e. `0.5` for median
            precision (int): The precision to calculate percentiles

        Returns:
            The check itself, so calls can be chained.
        """
        # Options are handed over as a list of (name, value) pairs
        settings = [("percentile", percentile), ("precision", precision)]
        rule = Rule(
            "has_percentile", column, value, CheckDataType.NUMERIC, options=settings
        )
        rule >> self._rule
        return self

    def is_inside_interquartile_range(
        self,
        column: str,
        value: Union[List[float], None] = None,
        pct: float = 1.0,
    ):
        """
        Validates a number resides inside the quartile(1) and quartile(3) of the range of values

        Args:
            column (str): Column name in dataframe
            value (List[number]): Numbers between 0 and 1 demarking the quartiles;
                defaults to `[0.25, 0.75]` when omitted
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        # A list default in the signature would be one shared object handed to every
        # rule created without an explicit value; build a fresh list per call instead.
        if value is None:
            value = [0.25, 0.75]
        rule = Rule(
            "is_inside_interquartile_range",
            column,
            value,
            CheckDataType.NUMERIC,
            pct,
        )
        rule >> self._rule
        return self

    def has_max_by(
        self, column_source: str, column_target: str, value: Union[float, str]
    ):
        """
        Register a rule validating the correspondence of a column value based on another column's maximum

        Args:
            column_source (str): Column used to obtain the row with the max value
            column_target (str): Column used to verify the matching value
            value (str,number): The value to match against

        Returns:
            The check itself, so calls can be chained.
        """
        column_pair = [column_source, column_target]
        rule = Rule("has_max_by", column_pair, value, CheckDataType.DUO)
        rule >> self._rule
        return self

    def has_min_by(
        self, column_source: str, column_target: str, value: Union[float, str]
    ):
        """
        Register a rule validating the correspondence of a column value based on another column's minimum

        Args:
            column_source (str): Column used to obtain the row with the min value
            column_target (str): Column used to verify the matching value
            value (str,number): The value to match against

        Returns:
            The check itself, so calls can be chained.
        """
        column_pair = [column_source, column_target]
        rule = Rule("has_min_by", column_pair, value, CheckDataType.DUO)
        rule >> self._rule
        return self

    def has_correlation(self, column_left: str, column_right: str, value: float):
        """
        Register a rule validating the correlation between 2 columns

        Args:
            column_left (str): Column name in dataframe
            column_right (str): Column name in dataframe
            value (float): Value to match the correlation

        Returns:
            The check itself, so calls can be chained.
        """
        column_pair = [column_left, column_right]
        rule = Rule("has_correlation", column_pair, value, CheckDataType.NUMERIC)
        rule >> self._rule
        return self

    def satisfies(
        self,
        column: str,
        predicate: str,
        pct: float = 1.0,
        options: Union[Dict[str, str], None] = None,
    ):
        """
        Validation of a column satisfying a SQL-like predicate

        Args:
            column (str): Column name in the dataframe
            predicate (str): A predicate written in SQL-like syntax
            pct (float): The threshold percentage required to pass
            options (Dict[str, str]): Extra options forwarded to the Rule; an empty
                dictionary is used when omitted

        Returns:
            The check itself, so calls can be chained.
        """
        # A `{}` default in the signature would be one shared object handed to every
        # rule created without explicit options; build a fresh dict per call instead.
        if options is None:
            options = {}
        rule = Rule(
            "satisfies",
            column,
            predicate,
            CheckDataType.AGNOSTIC,
            pct,
            options=options,
        )
        rule >> self._rule
        return self

    def has_cardinality(self, column: str, value: int):
        """
        Register a rule validating the number of distinct values in a column

        Args:
            column (str): Column name in the dataframe
            value (int): The number of expected distinct values on a column

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("has_cardinality", column, value, CheckDataType.AGNOSTIC)
        rule >> self._rule
        return self

    def has_infogain(self, column: str, pct: float = 1.0):
        """
        Register a rule validating cardinality > 1.
        Particularly useful when validating categorical data for Machine Learning

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule(
            method="has_infogain",
            column=column,
            value="N/A",
            data_type=CheckDataType.AGNOSTIC,
            coverage=pct,
        )
        rule >> self._rule
        return self

    def has_entropy(self, column: str, value: float, tolerance: float = 0.01):
        """
        Register a rule validating the entropy calculation on continuous variables/features on `log2`.
        Useful in Machine Learning classifications to test imbalanced datasets with low entropy.

        Args:
            column (str): Column name in the dataframe
            value (float): The expected entropy value
            tolerance (float): The tolerance/precision used when comparing the actual and expected value

        Returns:
            The check itself, so calls can be chained.
        """
        # Options are handed over as a list of (name, value) pairs
        settings = [("tolerance", tolerance)]
        rule = Rule(
            "has_entropy", column, value, CheckDataType.AGNOSTIC, options=settings
        )
        rule >> self._rule
        return self

    def is_on_weekday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is in a Mon-Fri time range

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_weekday", column, "Mon-Fri", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_weekend(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is in a Sat-Sun time range

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_weekend", column, "Sat-Sun", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_monday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Monday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_monday", column, "Mon", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_tuesday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Tuesday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_tuesday", column, "Tue", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_wednesday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Wednesday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_wednesday", column, "Wed", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_thursday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Thursday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_thursday", column, "Thu", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_friday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Friday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_friday", column, "Fri", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_saturday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Saturday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_saturday", column, "Sat", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_sunday(self, column: str, pct: float = 1.0):
        """
        Register a rule validating that a datetime column is on Sunday

        Args:
            column (str): Column name in the dataframe
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_sunday", column, "Sun", CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def is_on_schedule(self, column: str, value: Tuple[Any], pct: float = 1.0):
        """
        Register a rule validating that a datetime column falls within an hour interval

        Args:
            column (str): Column name in the dataframe
            value (Tuple[int,int]): A tuple indicating a 24hr day interval. i.e. (9,17) for 9am to 5pm
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_on_schedule", column, value, CheckDataType.TIMESTAMP, pct)
        rule >> self._rule
        return self

    def is_daily(
        self, column: str, value: Union[None, List[int]] = None, pct: float = 1.0
    ):
        """
        Register a rule validating that there are no missing dates, using only week days, in the date/timestamp column.

        An alternative day combination can be provided when only certain days should be validated.
        For example in PySpark, to validate that a time series has every Wednesday of a year
        without gaps, the value input should contain `[4]`, the numeric
        equivalent of the day of week Wednesday.

        Args:
            column (str): Column name in the dataframe
            value (List[int]): A list of numbers describing the days of the week to consider. i.e. Pyspark uses [2, 3, 4, 5, 6] for Mon-Fri
            pct (float): The threshold percentage required to pass

        Returns:
            The check itself, so calls can be chained.
        """
        rule = Rule("is_daily", column, value, CheckDataType.DATE, pct)
        rule >> self._rule
        return self

    def has_workflow(
        self,
        column_group: str,
        column_event: str,
        column_order: str,
        edges: List[Tuple[str]],
        pct: float = 1.0,
    ):
        """
        Checks that the events of each group, once ordered, follow an expected sequence of transitions. Similar to adjacency matrix validation.

        Args:
            column_group (str): The dataframe column used to group events
            column_event (str): The state of the event within the group
            column_order (List[date,number,str]): The order within the group, should be deterministic and without collisions.
            edges (List[Tuple[str,str]]): The combinations of events expected in the data frame i.e `[("A","B"), ("B","C")]`
            pct (float): The threshold percentage required to pass

        Returns:
            Check: This same check, so that rules can be chained


        ???+ example "Example"

            Given the following fictitious dataset example:

            | date       | ticket   | status      |
            |------------|----------|-------------|
            | 2024-01-01 | CASE-001 | New         |
            | 2024-01-02 | CASE-001 | In Progress |
            | 2024-01-03 | CASE-001 | Closed      |

            You can validate that events for each ticket follow certain sequence by using:

            ``` python
            from cuallee import Check, CheckLevel
            df = spark.createDataFrame(
                 [
                     ["2024-01-01", "CASE-001", "New"],
                     ["2024-01-02", "CASE-001", "In Progress"],
                     ["2024-01-03", "CASE-001", "Closed"],
                 ],
                 ["date", "ticket", "status"],
             )


            check = Check(CheckLevel.WARNING, "WorkflowValidation")
            check.has_workflow(
                column_group="ticket",
                column_event="status",
                column_order="date",
                edges=[(None, "New"),("New", "In Progress"),("In Progress","Closed"), ("Closed", None)]
            )

            # Validate
            check.validate(df).show(truncate=False)

            # Result
            +---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+
            |id |timestamp          |check             |level  |column                      |rule        |value                                                                               |rows|violations|pass_rate|pass_threshold|status|
            +---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+
            |1  |2024-05-11 11:24:00|WorkflowValidation|WARNING|('ticket', 'status', 'date')|has_workflow|((None, 'New'), ('New', 'In Progress'), ('In Progress', 'Closed'), ('Closed', None))|3   |0         |1.0      |1.0           |PASS  |
            +---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+

            ```

        The check validates that:

        - Nothing precedes a `New` state
        - `In Progress` follows the `New` event
        - `Closed` follows the `In Progress` event
        - Nothing follows after `Closed` state

        """
        # The three columns travel together as the rule's column argument
        workflow_columns = [column_group, column_event, column_order]
        workflow_rule = Rule(
            method="has_workflow",
            column=workflow_columns,
            value=edges,
            data_type=CheckDataType.AGNOSTIC,
            coverage=pct,
        )
        workflow_rule >> self._rule
        return self

    def is_custom(
        self,
        column: Union[str, List[str]],
        fn: Callable = None,
        pct: float = 1.0,
        options: Union[Dict[str, str], None] = None,
    ):
        """
        Uses a user-defined function that receives the to-be-validated dataframe
        and uses the last column of the transformed dataframe to summarize the check

        Args:
            column (str): Column(s) required for custom function
            fn (Callable): A function that receives a dataframe as input and returns a dataframe with at least 1 column as result
            pct (float): The threshold percentage required to pass
            options (Dict[str, str]): Extra options forwarded to the rule. Defaults to an empty dictionary

        Returns:
            Check: This same check, so that rules can be chained
        """
        # A dict literal as default would be created once and shared by every call;
        # build a fresh one per call so rules never share the same options object.
        if options is None:
            options = {}

        (
            Rule("is_custom", column, fn, CheckDataType.AGNOSTIC, pct, options=options)
            >> self._rule
        )
        return self

    def validate(self, dataframe: Any, ok: bool = False):
        """
        Compute all rules in this check for specific data frame

        Args:
            dataframe (Union[pyspark,snowpark,pandas,polars,duckdb,bigquery]): A dataframe object
            ok (bool): When True returns the compute engine's `ok` result, otherwise its `summary`

        Returns:
            Whatever the selected compute engine returns from `ok` or `summary`

        Raises:
            AssertionError: When the check has no rules, or the compute engine
                rejects the data types of the rules against the dataframe
            NotImplementedError: When the dataframe type matches no supported engine
        """

        # Stop execution if there are no rules in the check.
        # Raised explicitly (not via `assert`) so it still runs under `python -O`.
        if self.empty:
            raise AssertionError("Check is empty. Try adding some rules?")

        # str(type(obj)) looks like "<class 'package.module.Type'>"; keep the quoted path
        self.dtype = re.match(r".*'(.*)'", str(type(dataframe))).group(1)

        # Order matters: the first engine name found in the type path wins
        supported_engines = (
            "pyspark",
            "pandas",
            "snowpark",
            "polars",
            "duckdb",
            "bigquery",
            "daft",
        )
        for engine in supported_engines:
            if engine in self.dtype:
                self.compute_engine = importlib.import_module(
                    f"cuallee.{engine}_validation"
                )
                break
        else:
            raise NotImplementedError(
                f"{self.dtype} is not yet implemented in cuallee"
            )

        if not self.compute_engine.validate_data_types(self.rules, dataframe):
            raise AssertionError("Invalid data types between rules and dataframe")

        if ok:
            return self.compute_engine.ok(self, dataframe)
        return self.compute_engine.summary(self, dataframe)

    def ok(self, dataframe: Any) -> bool:
        """
        True when all checks passed

        Args:
            dataframe (Any): A dataframe object, forwarded to `validate`

        Returns:
            bool: The result of `validate` invoked with `ok=True`
        """
        all_passed = self.validate(dataframe, ok=True)
        return all_passed

`empty` `property`

True when no rules are added in the check

`keys` `property`

Returns blake2s unique identifiers of rules

`rules` `property`

Returns all rules defined for check

`sum` `property`

Total number of rules in Check

`__init__(level=0, name='cuallee.check', *, execution_date=datetime.now(timezone.utc), table_name=None, session=None)`

A container of data quality rules.

Parameters:

Name	Type	Description	Default
`level`	`CheckLevel`	[0-1] value to describe if its a WARNING or ERROR check	`0`
`name`	`str`	Normally the name of the dataset being verified, or a name for this check	`'cuallee.check'`
`execution_date`	`date`	An automatically generated timestamp of the check in UTC	`now(utc)`
`table_name`	`str`	When using databases matches the table name of the source	`None`
`session`	`Session`	When operating in Session enabled environments like Databricks or Snowflake	`None`

Source code in cuallee/__init__.py

def __init__(
    self,
    level: Union[CheckLevel, int] = 0,
    name: str = "cuallee.check",
    *,
    execution_date: Union[datetime, None] = None,
    table_name: str = None,
    session: Any = None,
):
    """
    A container of data quality rules.

    Args:
        level (CheckLevel): [0-1] value to describe if its a WARNING or ERROR check
        name (str): Normally the name of the dataset being verified, or a name for this check
        execution_date (date): Timestamp of the check. When omitted, the current UTC time at construction is used
        table_name (str): When using databases matches the table name of the source
        session (Session): When operating in Session enabled environments like Databricks or Snowflake

    """
    self._rule: Dict[str, Rule] = {}
    self.compute_engine: ModuleType

    if isinstance(level, int):
        # When the user is lazy and wants to do WARN=0, or ERR=1
        level = CheckLevel(level)

    # A `datetime.now(...)` default in the signature is evaluated only once, when
    # the function is defined, so every check would carry that same timestamp.
    # Resolve it per instance instead.
    if execution_date is None:
        execution_date = datetime.now(timezone.utc)

    self.level = level
    self.name = name
    self.date = execution_date
    self.rows = -1
    self.config: Dict[str, str] = {}
    self.table_name = table_name
    self.dtype = "cuallee.dataframe"
    try:
        from .iso.checks import ISO
        from .bio.checks import BioChecks

        self.iso = ISO(self)
        self.bio = BioChecks(self)
    except (ModuleNotFoundError, ImportError) as err:
        # Best effort: the check is still created, only the failure is logged
        logger.error(f"Dependency modules missing: {str(err)}")
    self.session = session

`_remove_rule_generic(key)`

Remove a key from rules and compute dictionaries

Parameters:

Name	Type	Description	Default
`key`	`str`	the blake2s key of the rule	required

Source code in cuallee/__init__.py

def _remove_rule_generic(self, key: str):
    """
    Drops a rule from the check, doing nothing when the key is not present

    Args:
        key (str): the blake2s key of the rule
    """
    # The None default makes a missing key a silent no-op
    self._rule.pop(key, None)

`add_rule(method, *arg, **kwargs)`

Add a new rule to the Check class.

Parameters:

Name	Type	Description	Default
`method`	`str`	Check name	required
`arg`	`list`	Parameters of the Rule	`()`
`kwargs`	`dict`	Dictionary of options for the Rule	required

Source code in cuallee/__init__.py

def add_rule(self, method: str, *arg, **kwargs):
    """
    Adds a rule by invoking the check method with the given name.

    Args:
        method (str): Check name
        arg (list): Parameters of the Rule
        kwargs (dict): Dictionary of options for the Rule

    Returns:
        Whatever the invoked check method returns
    """
    rule_method = getattr(self, method)
    return rule_method(*arg, **kwargs)

`adjust_rule_coverage(rule_index, rule_coverage)`

Adjust the ratio predicate/rows for a rule. It is intended to lower or increase tolerance without having to rewrite the entire check

Parameters:

Name	Type	Description	Default
`rule_index`	`int`	The position of the rule in the check list	required
`rule_coverage`	`float`	New value between [0..1] for tolerance	required

Source code in cuallee/__init__.py

def adjust_rule_coverage(self, rule_index: int, rule_coverage: float):
    """
    Changes the coverage (ratio predicate/rows) of an existing rule.
    Useful to lower or raise tolerance without rewriting the entire check

    Args:
        rule_index (int): The position of the rule in the check list
        rule_coverage (float): New value between [0..1] for tolerance

    Returns:
        Check: This same check, so that calls can be chained
    """
    stale_key = self.rules[rule_index].key
    # Take the rule out under its current key before changing it, then push it back
    rule = self._rule.pop(stale_key)
    rule.coverage = rule_coverage
    rule >> self._rule
    return self

`are_complete(column, pct=1.0)`

Validation for non-null values in a group of columns

Parameters:

Name	Type	Description	Default
`column`	`List[str]`	A tuple or list of column names in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def are_complete(self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0):
    """
    Checks for non-null values across a group of columns

    Args:
        column (List[str]): A tuple or list of column names in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same check, so that rules can be chained
    """
    completeness_rule = Rule(
        method="are_complete",
        column=column,
        value="N/A",
        data_type=CheckDataType.AGNOSTIC,
        coverage=pct,
    )
    completeness_rule >> self._rule
    return self

`are_unique(column, pct=1.0)`

Validation for unique values in a group of columns

Parameters:

Name	Type	Description	Default
`column`	`List[str]`	A tuple or list of column names in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def are_unique(self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0):
    """
    Checks for unique values across a group of columns

    Args:
        column (List[str]): A tuple or list of column names in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same check, so that rules can be chained
    """
    uniqueness_rule = Rule(
        method="are_unique",
        column=column,
        value="N/A",
        data_type=CheckDataType.AGNOSTIC,
        coverage=pct,
    )
    uniqueness_rule >> self._rule
    return self

`delete_rule_by_attribute(rule_attribute, values)`

Delete rule based on method(s) or column name(s) or coverage value(s).

Parameters:

Name	Type	Description	Default
`rule_attribute`	`str`	Finds a rule by: method, column or coverage	required
`values`	`List[str]`	Deletes a rule that matches the rule_attribute equal to the value in this parameter	required

Source code in cuallee/__init__.py

def delete_rule_by_attribute(
    self,
    rule_attribute: Literal["method", "column", "coverage"],
    values: Union[List[str], List[float]],
):
    """
    Delete rule based on method(s) or column name(s) or coverage value(s).

    Args:
        rule_attribute (str): Finds a rule by: method, column or coverage
        values (List[str]): Deletes every rule whose rule_attribute equals one of these values.
            A single value that is not a list is treated as a one-element list

    Returns:
        Check: This same check, so that calls can be chained
    """
    if not isinstance(values, list):
        values = [values]

    read_attribute = operator.attrgetter(rule_attribute)

    # Collect the matching keys first: rules cannot be removed from the
    # dictionary while it is being iterated.
    matching_keys = [
        key for key, rule in self._rule.items() if read_attribute(rule) in values
    ]
    for key in matching_keys:
        self._remove_rule_generic(key)
    return self

`delete_rule_by_key(keys)`

Delete rules from check based on keys.

Parameters:

Name	Type	Description	Default
`keys`	`List[str]`	a single or list of keys to remove from the check	required

Source code in cuallee/__init__.py

def delete_rule_by_key(self, keys: Union[str, List[str]]):
    """
    Delete rules from check based on keys.

    Args:
        keys (List[str]): a single or list of keys to remove from the check

    Returns:
        Check: This same check, so that calls can be chained
    """
    if isinstance(keys, str):
        # A lone key would otherwise be iterated character by character
        keys = [keys]

    for key in keys:
        self._remove_rule_generic(key)
    return self

`has_cardinality(column, value)`

Validates the number of distinct values in a column

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`value`	`int`	The number of expected distinct values on a column	required

Source code in cuallee/__init__.py

def has_cardinality(self, column: str, value: int):
    """
    Checks the number of distinct values in a column

    Args:
        column (str): Column name in the dataframe
        value (int): The number of expected distinct values on a column

    Returns:
        Check: This same check, so that rules can be chained
    """
    cardinality_rule = Rule(
        method="has_cardinality",
        column=column,
        value=value,
        data_type=CheckDataType.AGNOSTIC,
    )
    cardinality_rule >> self._rule
    return self

`has_correlation(column_left, column_right, value)`

Validates the correlation in a range of [0..1] between 2 columns

Parameters:

Name	Type	Description	Default
`column_left`	`str`	Column name in dataframe	required
`column_right`	`str`	Column name in dataframe	required
`value`	`float`	Value to match the correlation	required

Source code in cuallee/__init__.py

def has_correlation(self, column_left: str, column_right: str, value: float):
    """
    Checks the correlation, in a range of [0..1], between 2 columns

    Args:
        column_left (str): Column name in dataframe
        column_right (str): Column name in dataframe
        value (float): Value to match the correlation

    Returns:
        Check: This same check, so that rules can be chained
    """
    column_pair = [column_left, column_right]
    correlation_rule = Rule(
        method="has_correlation",
        column=column_pair,
        value=value,
        data_type=CheckDataType.NUMERIC,
    )
    correlation_rule >> self._rule
    return self

`has_entropy(column, value, tolerance=0.01)`

Validation for entropy calculation on continuous variables/features on log2. Useful in Machine Learning classifications to test imbalanced datasets with low entropy.

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`value`	`float`	The expected entropy value	required
`tolerance`	`float`	The tolerance/precision used when comparing the actual and expected value	`0.01`

Examples:

Source code in cuallee/__init__.py

def has_entropy(self, column: str, value: float, tolerance: float = 0.01):
    """
    Checks the entropy, calculated on `log2`, of continuous variables/features.
    Useful in Machine Learning classifications to test imbalanced datasets with low entropy.

    Args:
        column (str): Column name in the dataframe
        value (float): The expected entropy value
        tolerance (float): The tolerance/precision used when comparing the actual and expected value

    Returns:
        Check: This same check, so that rules can be chained
    """
    # The tolerance travels to the rule as a (name, value) option pair
    entropy_options = [("tolerance", tolerance)]
    entropy_rule = Rule(
        method="has_entropy",
        column=column,
        value=value,
        data_type=CheckDataType.AGNOSTIC,
        options=entropy_options,
    )
    entropy_rule >> self._rule
    return self

`has_infogain(column, pct=1.0)`

Validate cardinality > 1. Particularly useful when validating categorical data for Machine Learning

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def has_infogain(self, column: str, pct: float = 1.0):
    """
    Checks that cardinality is > 1.
    Particularly useful when validating categorical data for Machine Learning

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same check, so that rules can be chained
    """
    infogain_rule = Rule("has_infogain", column, "N/A", CheckDataType.AGNOSTIC, pct)
    infogain_rule >> self._rule
    return self

`has_max(column, value)`

Validation of a column's maximum value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required

Source code in cuallee/__init__.py

def has_max(self, column: str, value: float):
    """
    Checks the maximum value of a column

    Args:
        column (str): Column name in dataframe
        value (number): The condition for the column to match

    Returns:
        Check: This same check, so that rules can be chained
    """
    max_rule = Rule(
        method="has_max",
        column=column,
        value=value,
        data_type=CheckDataType.NUMERIC,
    )
    max_rule >> self._rule
    return self

`has_max_by(column_source, column_target, value)`

Validation of the correspondence of a column value based on another column maximum

Parameters:

Name	Type	Description	Default
`column_source`	`str`	Column used to obtain the row with the max value	required
`column_target`	`str`	Column used to verify the matching value	required
`value`	`(str, number)`	The value to match against	required

Source code in cuallee/__init__.py

def has_max_by(
    self, column_source: str, column_target: str, value: Union[float, str]
):
    """
    Checks the value of one column on the row where another column has its maximum

    Args:
        column_source (str): Column used to obtain the row with the max value
        column_target (str): Column used to verify the matching value
        value (str,number): The value to match against

    Returns:
        Check: This same check, so that rules can be chained
    """
    column_pair = [column_source, column_target]
    max_by_rule = Rule(
        method="has_max_by",
        column=column_pair,
        value=value,
        data_type=CheckDataType.DUO,
    )
    max_by_rule >> self._rule
    return self

`has_mean(column, value)`

Validation of a column's average/mean

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required

Source code in cuallee/__init__.py

def has_mean(self, column: str, value: float):
    """
    Checks the average/mean of a column

    Args:
        column (str): Column name in dataframe
        value (number): The condition for the column to match

    Returns:
        Check: This same check, so that rules can be chained
    """
    mean_rule = Rule(
        method="has_mean",
        column=column,
        value=value,
        data_type=CheckDataType.NUMERIC,
    )
    mean_rule >> self._rule
    return self

`has_min(column, value)`

Validation of a column's minimum value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required

Source code in cuallee/__init__.py

def has_min(self, column: str, value: float):
    """
    Checks the minimum value of a column

    Args:
        column (str): Column name in dataframe
        value (number): The condition for the column to match

    Returns:
        Check: This same check, so that rules can be chained
    """
    min_rule = Rule(
        method="has_min",
        column=column,
        value=value,
        data_type=CheckDataType.NUMERIC,
    )
    min_rule >> self._rule
    return self

`has_min_by(column_source, column_target, value)`

Validation the correspondence of a column value based on another column minimum

Parameters:

Name	Type	Description	Default
`column_source`	`str`	Column used to obtain the row with the min value	required
`column_target`	`str`	Column used to verify the matching value	required
`value`	`(str, number)`	The value to match against	required

Source code in cuallee/__init__.py

def has_min_by(
    self, column_source: str, column_target: str, value: Union[float, str]
):
    """
    Checks the value of one column on the row where another column has its minimum

    Args:
        column_source (str): Column used to obtain the row with the min value
        column_target (str): Column used to verify the matching value
        value (str,number): The value to match against

    Returns:
        Check: This same check, so that rules can be chained
    """
    column_pair = [column_source, column_target]
    min_by_rule = Rule(
        method="has_min_by",
        column=column_pair,
        value=value,
        data_type=CheckDataType.DUO,
    )
    min_by_rule >> self._rule
    return self

`has_pattern(column, value, pct=1.0, options={})`

Validation for string type column matching regex expression

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`regex`	A regular expression used to match values in the `column`	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def has_pattern(
    self,
    column: str,
    value: str,
    pct: float = 1.0,
    options: Union[Dict[str, str], None] = None,
):
    """
    Validation for string type column matching regex expression

    Args:
        column (str): Column name in dataframe
        value (regex): A regular expression used to match values in the `column`
        pct (float): The threshold percentage required to pass
        options (Dict[str, str]): Extra options forwarded to the rule. Defaults to an empty dictionary

    Returns:
        Check: This same check, so that rules can be chained
    """
    # A dict literal as default would be created once and shared by every call;
    # build a fresh one per call so rules never share the same options object.
    if options is None:
        options = {}

    (
        Rule(
            "has_pattern", column, value, CheckDataType.STRING, pct, options=options
        )
        >> self._rule
    )
    return self

`has_percentile(column, value, percentile, precision=10000)`

Validation of a column percentile value using approximation

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[str, number, date]`	The condition for the column to match	required
`percentile`	`float`	Value between [0,1] i.e. `0.5` for median	required
`precision`	`float`	The precision to calculate percentiles	`10000`

Source code in cuallee/__init__.py

def has_percentile(
    self, column: str, value: float, percentile: float, precision: int = 10000
):
    """
    Checks the value of a column percentile, obtained by approximation

    Args:
        column (str): Column name in dataframe
        value (float): The condition for the column to match
        percentile (float): Value between [0,1] i.e. `0.5` for median
        precision (int): The precision to calculate percentiles

    Returns:
        Check: This same check, so that rules can be chained
    """
    # Both settings travel to the rule as (name, value) option pairs
    percentile_options = [
        ("percentile", percentile),
        ("precision", precision),
    ]
    percentile_rule = Rule(
        method="has_percentile",
        column=column,
        value=value,
        data_type=CheckDataType.NUMERIC,
        options=percentile_options,
    )
    percentile_rule >> self._rule
    return self

`has_std(column, value)`

Validation of a column's standard deviation

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required

Source code in cuallee/__init__.py

def has_std(self, column: str, value: float):
    """
    Checks the standard deviation of a column

    Args:
        column (str): Column name in dataframe
        value (number): The condition for the column to match

    Returns:
        Check: This same check, so that rules can be chained
    """
    std_rule = Rule(
        method="has_std",
        column=column,
        value=value,
        data_type=CheckDataType.NUMERIC,
    )
    std_rule >> self._rule
    return self

`has_sum(column, value)`

Validation of a sum of all values of a column

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required

Source code in cuallee/__init__.py

def has_sum(self, column: str, value: float):
    """
    Checks the sum of all values of a column

    Args:
        column (str): Column name in dataframe
        value (number): The condition for the column to match

    Returns:
        Check: This same check, so that rules can be chained
    """
    sum_rule = Rule(
        method="has_sum",
        column=column,
        value=value,
        data_type=CheckDataType.NUMERIC,
    )
    sum_rule >> self._rule
    return self

`has_workflow(column_group, column_event, column_order, edges, pct=1.0)`

Validates events in a group clause with order, followed a specific sequence. Similar to adjacency matrix validation.

Parameters:

Name	Type	Description	Default
`column_group`	`str`	The dataframe column used to group events	required
`column_event`	`str`	The state of the event within the group	required
`column_order`	`List[date, number, str]`	The order within the group, should be deterministic and without collisions.	required
`edges`	`List[Tuple[str, str]]`	The combinations of events expected in the data frame i.e `[("A","B"), ("B","C")]`	required

Example

Given the following fictitious dataset example:

date	ticket	status
2024-01-01	CASE-001	New
2024-01-02	CASE-001	In Progress
2024-01-03	CASE-001	Closed

You can validate that events for each ticket follow certain sequence by using:

from cuallee import Check, CheckLevel
df = spark.createDataFrame(
     [
         ["2024-01-01", "CASE-001", "New"],
         ["2024-01-02", "CASE-001", "In Progress"],
         ["2024-01-03", "CASE-001", "Closed"],
     ],
     ["date", "ticket", "status"],
 )


check = Check(CheckLevel.WARNING, "WorkflowValidation")
check.has_workflow(
    column_group="ticket",
    column_event="status",
    column_order="date",
    edges=[(None, "New"),("New", "In Progress"),("In Progress","Closed"), ("Closed", None)]
)

# Validate
check.validate(df).show(truncate=False)

# Result
+---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+
|id |timestamp          |check             |level  |column                      |rule        |value                                                                               |rows|violations|pass_rate|pass_threshold|status|
+---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+
|1  |2024-05-11 11:24:00|WorkflowValidation|WARNING|('ticket', 'status', 'date')|has_workflow|((None, 'New'), ('New', 'In Progress'), ('In Progress', 'Closed'), ('Closed', None))|3   |0         |1.0      |1.0           |PASS  |
+---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+

The check validates that:

Nothing precedes a New state
In Progress follows the New event
Closed follows the In Progress event
Nothing follows after Closed state

Source code in cuallee/__init__.py

def has_workflow(
    self,
    column_group: str,
    column_event: str,
    column_order: str,
    edges: List[Tuple[str]],
    pct: float = 1.0,
):
    """
    Checks that the events of each group, once ordered, follow an expected sequence of transitions. Similar to adjacency matrix validation.

    Args:
        column_group (str): The dataframe column used to group events
        column_event (str): The state of the event within the group
        column_order (List[date,number,str]): The order within the group, should be deterministic and without collisions.
        edges (List[Tuple[str,str]]): The combinations of events expected in the data frame i.e `[("A","B"), ("B","C")]`
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same check, so that rules can be chained


    ???+ example "Example"

        Given the following fictitious dataset example:

        | date       | ticket   | status      |
        |------------|----------|-------------|
        | 2024-01-01 | CASE-001 | New         |
        | 2024-01-02 | CASE-001 | In Progress |
        | 2024-01-03 | CASE-001 | Closed      |

        You can validate that events for each ticket follow certain sequence by using:

        ``` python
        from cuallee import Check, CheckLevel
        df = spark.createDataFrame(
             [
                 ["2024-01-01", "CASE-001", "New"],
                 ["2024-01-02", "CASE-001", "In Progress"],
                 ["2024-01-03", "CASE-001", "Closed"],
             ],
             ["date", "ticket", "status"],
         )


        check = Check(CheckLevel.WARNING, "WorkflowValidation")
        check.has_workflow(
            column_group="ticket",
            column_event="status",
            column_order="date",
            edges=[(None, "New"),("New", "In Progress"),("In Progress","Closed"), ("Closed", None)]
        )

        # Validate
        check.validate(df).show(truncate=False)

        # Result
        +---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+
        |id |timestamp          |check             |level  |column                      |rule        |value                                                                               |rows|violations|pass_rate|pass_threshold|status|
        +---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+
        |1  |2024-05-11 11:24:00|WorkflowValidation|WARNING|('ticket', 'status', 'date')|has_workflow|((None, 'New'), ('New', 'In Progress'), ('In Progress', 'Closed'), ('Closed', None))|3   |0         |1.0      |1.0           |PASS  |
        +---+-------------------+------------------+-------+----------------------------+------------+------------------------------------------------------------------------------------+----+----------+---------+--------------+------+

        ```

    The check validates that:

    - Nothing precedes a `New` state
    - `In Progress` follows the `New` event
    - `Closed` follows the `In Progress` event
    - Nothing follows after `Closed` state

    """
    # The three columns travel together as the rule's column argument
    workflow_columns = [column_group, column_event, column_order]
    workflow_rule = Rule(
        method="has_workflow",
        column=workflow_columns,
        value=edges,
        data_type=CheckDataType.AGNOSTIC,
        coverage=pct,
    )
    workflow_rule >> self._rule
    return self

`is_between(column, value, pct=1.0)`

Validation of a column between a range

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[str, number, date]`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_between(self, column: str, value: Tuple[Any], pct: float = 1.0):
    """
    Checks that the values of a column fall within a range

    Args:
        column (str): Column name in dataframe
        value (List[str,number,date]): The condition for the column to match
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same check, so that rules can be chained
    """
    range_rule = Rule(
        method="is_between",
        column=column,
        value=value,
        data_type=CheckDataType.AGNOSTIC,
        coverage=pct,
    )
    range_rule >> self._rule
    return self

`is_complete(column, pct=1.0)`

Validation for non-null values in column

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_complete(self, column: str, pct: float = 1.0):
    """
    Validates that a column holds non-null values

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    # No comparison value exists for this rule, so the "N/A" placeholder
    # goes in the position where other rules pass their `value`
    completeness = Rule("is_complete", column, "N/A", CheckDataType.AGNOSTIC, pct)
    completeness >> self._rule
    return self

`is_composite_key(column, pct=1.0)`

Validation for unique values in a group of columns

Parameters:

Name	Type	Description	Default
`column`	`List[str]`	Names of the columns in the dataframe that form the key	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_composite_key(
    self, column: Union[List[str], Tuple[str, str]], pct: float = 1.0
):
    """
    Validates that the values in a group of columns are unique

    Args:
        column (List[str]): Names of the columns in the dataframe forming the key
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    # Built on the `are_unique` rule; the options carry the `is_composite_key` name
    alias = {"name": "is_composite_key"}
    composite_rule = Rule(
        "are_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options=alias
    )
    composite_rule >> self._rule
    return self

`is_contained_in(column, value, pct=1.0, options={})`

Validation of column value in set of given values

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[str, number, date]`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_contained_in(
    self,
    column: str,
    value: Union[List, Tuple],
    pct: float = 1.0,
    options: Dict[str, str] = {},
):
    """
    Validates that the values of a column belong to a set of given values

    Args:
        column (str): Column name in dataframe
        value (List[str,number,date]): The collection of accepted values
        pct (float): The threshold percentage required to pass
        options (Dict[str,str]): Extra options forwarded unchanged to the rule

    Returns:
        Check: This same instance (`self`)
    """
    # NOTE(review): the default `options` dict is a single object shared by
    # every call that omits the argument; it is forwarded as-is, so confirm
    # that Rule never mutates it.
    membership = Rule(
        "is_contained_in", column, value, CheckDataType.AGNOSTIC, pct, options=options
    )
    membership >> self._rule
    return self

`is_custom(column, fn=None, pct=1.0, options={})`

Uses a user-defined function that receives the to-be-validated dataframe and uses the last column of the transformed dataframe to summarize the check

Parameters:

Name	Type	Description	Default
`column`	`str`	Column(s) required for custom function	required
`fn`	`Callable`	A function that receives a dataframe as input and returns a dataframe with at least 1 column as result	`None`
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_custom(
    self,
    column: Union[str, List[str]],
    fn: Callable = None,
    pct: float = 1.0,
    options: Dict[str, str] = {},
):
    """
    Applies a user-defined function to the to-be-validated dataframe and
    summarizes the check from the last column of the transformed dataframe

    Args:
        column (str): Column(s) required for custom function
        fn (Callable): A function that receives a dataframe as input and returns a dataframe with at least 1 column as result
        pct (float): The threshold percentage required to pass
        options (Dict[str,str]): Extra options forwarded unchanged to the rule

    Returns:
        Check: This same instance (`self`)
    """
    # The callable travels in the position where other rules pass their `value`
    custom_rule = Rule(
        "is_custom", column, fn, CheckDataType.AGNOSTIC, pct, options=options
    )
    custom_rule >> self._rule
    return self

`is_daily(column, value=None, pct=1.0)`

Validates that there are no missing dates, using only weekdays, in the date/timestamp column.

An alternative day combination can be provided when a user wants to validate only certain days. For example, in PySpark, to validate that a time series has an entry on every Wednesday of a year without any missing values, the value input should contain [4], as it represents the numeric equivalent of the day of the week Wednesday.

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`value`	`List[int]`	A list of numbers describing the days of the week to consider. i.e. Pyspark uses [2, 3, 4, 5, 6] for Mon-Fri	`None`
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_daily(
    self, column: str, value: Union[None, List[int]] = None, pct: float = 1.0
):
    """
    Validates that there are no missing dates, using only week days, in the date/timestamp column.

    A different combination of days can be supplied when only certain dates should be validated.
    For example in PySpark, to validate that a time series has every consecutive Wednesday of a year
    without missing values, the value input should contain `[4]` as it represents the numeric
    equivalent of the day of week Wednesday.

    Args:
        column (str): Column name in the dataframe
        value (List[int]): A list of numbers describing the days of the week to consider. i.e. Pyspark uses [2, 3, 4, 5, 6] for Mon-Fri
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    daily_rule = Rule("is_daily", column, value, CheckDataType.DATE, pct)
    daily_rule >> self._rule
    return self

`is_empty(column, pct=1.0)`

Validation for null values in column

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_empty(self, column: str, pct: float = 1.0):
    """
    Validates that a column holds null values

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    # No comparison value exists for this rule, so the "N/A" placeholder is used
    emptiness = Rule("is_empty", column, "N/A", CheckDataType.AGNOSTIC, pct)
    emptiness >> self._rule
    return self

`is_equal_than(column, value, pct=1.0)`

Validation for numeric column equal than value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_equal_than(self, column: str, value: float, pct: float = 1.0):
    """
    Validates that a numeric column is equal to a value

    Args:
        column (str): Column name in dataframe
        value (number): The number the column is compared against
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    equality = Rule("is_equal_than", column, value, CheckDataType.NUMERIC, pct)
    equality >> self._rule
    return self

`is_greater_or_equal_than(column, value, pct=1.0)`

Validation for numeric greater or equal than value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_greater_or_equal_than(self, column: str, value: float, pct: float = 1.0):
    """
    Validates that a numeric column is greater than or equal to a value

    Args:
        column (str): Column name in dataframe
        value (number): The number the column is compared against
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    lower_bound = Rule(
        "is_greater_or_equal_than", column, value, CheckDataType.NUMERIC, pct
    )
    lower_bound >> self._rule
    return self

`is_greater_than(column, value, pct=1.0)`

Validation for numeric greater than value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_greater_than(self, column: str, value: float, pct: float = 1.0):
    """
    Validates that a numeric column is greater than a value

    Args:
        column (str): Column name in dataframe
        value (number): The number the column is compared against
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    strict_lower_bound = Rule(
        "is_greater_than", column, value, CheckDataType.NUMERIC, pct
    )
    strict_lower_bound >> self._rule
    return self

`is_in(column, value, pct=1.0)`

Validation of column value in set of given values

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[str, number, date]`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_in(self, column: str, value: Tuple[str, int, float], pct: float = 1.0):
    """
    Validates that the values of a column belong to a set of given values

    Shorthand for `is_contained_in`, passing `is_in` as the `name` option.

    Args:
        column (str): Column name in dataframe
        value (List[str,number,date]): The collection of accepted values
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_contained_in` returns
    """
    alias = {"name": "is_in"}
    return self.is_contained_in(column, value, pct, options=alias)

`is_in_billions(column, pct=1.0)`

Validates that a column has values greater than or equal to 1B (1e9)

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_in_billions(self, column: str, pct: float = 1.0):
    """
    Validates that a column has values greater than or equal to 1B (1e9)

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_greater_or_equal_than` returns
    """
    one_billion = 1e9
    return self.is_greater_or_equal_than(column, one_billion, pct)

`is_in_millions(column, pct=1.0)`

Validates that a column has values greater than or equal to 1M (1e6)

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_in_millions(self, column: str, pct: float = 1.0):
    """
    Validates that a column has values greater than or equal to 1M (1e6)

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_greater_or_equal_than` returns
    """
    one_million = 1e6
    return self.is_greater_or_equal_than(column, one_million, pct)

`is_inside_interquartile_range(column, value=[0.25, 0.75], pct=1.0)`

Validates a number resides inside the quartile(1) and quartile(3) of the range of values

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[number]`	A number between 0 and 1 demarking the quartile	`[0.25, 0.75]`
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_inside_interquartile_range(
    self, column: str, value: List[float] = [0.25, 0.75], pct: float = 1.0
):
    """
    Validates that a number lies inside quartile(1) and quartile(3) of the range of values

    Args:
        column (str): Column name in dataframe
        value (List[number]): Numbers between 0 and 1 demarking the quartiles
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    iqr_rule = Rule(
        "is_inside_interquartile_range", column, value, CheckDataType.NUMERIC, pct
    )
    iqr_rule >> self._rule
    return self

`is_legit(column, pct=1.0)`

Validation for string columns giving wrong signal about completeness due to empty strings.

Useful for reading CSV files and preventing empty strings being reported as valid records. This is an alias implementation of the has_pattern rule using "not blank space" as the pattern, which validates the presence of non-empty characters between the beginning and end of a string.

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_legit(self, column: str, pct: float = 1.0):
    """
    Validation for string columns that give a wrong signal about completeness due to empty strings.

    Useful when reading CSV files, to prevent empty strings from being reported as valid records.
    This is an `alias` of the `has_pattern` rule, using a pattern of one or more
    non-whitespace characters spanning the string from its beginning to its end.

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    non_blank = r"^\S+$"
    alias = {"name": "is_legit"}
    legit_rule = Rule(
        "has_pattern", column, non_blank, CheckDataType.STRING, pct, options=alias
    )
    legit_rule >> self._rule
    return self

`is_less_or_equal_than(column, value, pct=1.0)`

Validation for numeric less or equal than value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_less_or_equal_than(self, column: str, value: float, pct: float = 1.0):
    """
    Validates that a numeric column is less than or equal to a value

    Args:
        column (str): Column name in dataframe
        value (number): The number the column is compared against
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    upper_bound = Rule(
        "is_less_or_equal_than", column, value, CheckDataType.NUMERIC, pct
    )
    upper_bound >> self._rule
    return self

`is_less_than(column, value, pct=1.0)`

Validation for numeric less than value

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`number`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_less_than(self, column: str, value: float, pct: float = 1.0):
    """
    Validates that a numeric column is less than a value

    Args:
        column (str): Column name in dataframe
        value (number): The number the column is compared against
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    strict_upper_bound = Rule(
        "is_less_than", column, value, CheckDataType.NUMERIC, pct
    )
    strict_upper_bound >> self._rule
    return self

`is_negative(column, pct=1.0)`

Validation for numeric less than zero

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_negative(self, column: str, pct: float = 1.0):
    """
    Validates that a numeric column is less than zero

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_less_than` returns
    """
    zero = 0
    return self.is_less_than(column, zero, pct)

`is_on_friday(column, pct=1.0)`

Validates a datetime column is on Friday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_friday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Friday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    friday_rule = Rule("is_on_friday", column, "Fri", CheckDataType.DATE, pct)
    friday_rule >> self._rule
    return self

`is_on_monday(column, pct=1.0)`

Validates a datetime column is on Monday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_monday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Monday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    monday_rule = Rule("is_on_monday", column, "Mon", CheckDataType.DATE, pct)
    monday_rule >> self._rule
    return self

`is_on_saturday(column, pct=1.0)`

Validates a datetime column is on Saturday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_saturday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Saturday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    saturday_rule = Rule("is_on_saturday", column, "Sat", CheckDataType.DATE, pct)
    saturday_rule >> self._rule
    return self

`is_on_schedule(column, value, pct=1.0)`

Validation of a datetime column between an hour interval

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`value`	`Tuple[int, int]`	A tuple indicating a 24hr day interval. i.e. (9,17) for 9am to 5pm	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_schedule(self, column: str, value: Tuple[Any], pct: float = 1.0):
    """
    Validates that a datetime column falls between an hour interval

    Args:
        column (str): Column name in the dataframe
        value (Tuple[int,int]): A tuple indicating a 24hr day interval. i.e. (9,17) for 9am to 5pm
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    schedule_rule = Rule(
        "is_on_schedule", column, value, CheckDataType.TIMESTAMP, pct
    )
    schedule_rule >> self._rule
    return self

`is_on_sunday(column, pct=1.0)`

Validates a datetime column is on Sunday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_sunday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Sunday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    sunday_rule = Rule("is_on_sunday", column, "Sun", CheckDataType.DATE, pct)
    sunday_rule >> self._rule
    return self

`is_on_thursday(column, pct=1.0)`

Validates a datetime column is on Thursday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_thursday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Thursday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    thursday_rule = Rule("is_on_thursday", column, "Thu", CheckDataType.DATE, pct)
    thursday_rule >> self._rule
    return self

`is_on_tuesday(column, pct=1.0)`

Validates a datetime column is on Tuesday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_tuesday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Tuesday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    tuesday_rule = Rule("is_on_tuesday", column, "Tue", CheckDataType.DATE, pct)
    tuesday_rule >> self._rule
    return self

`is_on_wednesday(column, pct=1.0)`

Validates a datetime column is on Wednesday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_wednesday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls on Wednesday

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    wednesday_rule = Rule("is_on_wednesday", column, "Wed", CheckDataType.DATE, pct)
    wednesday_rule >> self._rule
    return self

`is_on_weekday(column, pct=1.0)`

Validates a datetime column is in a Mon-Fri time range

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_weekday(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls in a Mon-Fri time range

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    weekday_rule = Rule("is_on_weekday", column, "Mon-Fri", CheckDataType.DATE, pct)
    weekday_rule >> self._rule
    return self

`is_on_weekend(column, pct=1.0)`

Validates a datetime column is in a Sat-Sun time range

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_on_weekend(self, column: str, pct: float = 1.0):
    """
    Validates that a datetime column falls in a Sat-Sun time range

    Args:
        column (str): Column name in the dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    weekend_rule = Rule("is_on_weekend", column, "Sat-Sun", CheckDataType.DATE, pct)
    weekend_rule >> self._rule
    return self

`is_positive(column, pct=1.0)`

Validation for numeric greater than zero

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_positive(self, column: str, pct: float = 1.0):
    """
    Validates that a numeric column is greater than zero

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_greater_than` returns
    """
    zero = 0
    return self.is_greater_than(column, zero, pct)

`is_primary_key(column, pct=1.0)`

Validation for unique values in column

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_primary_key(self, column: str, pct: float = 1.0):
    """
    Validates that the values of a column are unique

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    # Built on the `is_unique` rule; the options carry the `is_primary_key` name
    alias = {"name": "is_primary_key"}
    key_rule = Rule(
        "is_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options=alias
    )
    key_rule >> self._rule
    return self

`is_t_minus_1(column, pct=1.0)`

Validate that date is yesterday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_t_minus_1(self, column: str, pct: float = 1.0):
    """
    Validates that a date is yesterday

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_t_minus_n` returns
    """
    alias = {"name": "is_t_minus_1"}
    return self.is_t_minus_n(column, 1, pct, options=alias)

`is_t_minus_2(column, pct=1.0)`

Validate that date is 2 days ago

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_t_minus_2(self, column: str, pct: float = 1.0):
    """
    Validates that a date is 2 days ago

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_t_minus_n` returns
    """
    alias = {"name": "is_t_minus_2"}
    return self.is_t_minus_n(column, 2, pct, options=alias)

`is_t_minus_3(column, pct=1.0)`

Validate that date is 3 days ago

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_t_minus_3(self, column: str, pct: float = 1.0):
    """
    Validates that a date is 3 days ago

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_t_minus_n` returns
    """
    alias = {"name": "is_t_minus_3"}
    return self.is_t_minus_n(column, 3, pct, options=alias)

`is_t_minus_n(column, value, pct=1.0, options={'name': 'is_t_minus_n'})`

Validate that date is n days before the current date

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`int`	The number of days before the current date	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_t_minus_n(
    self,
    column: str,
    value: int,
    pct: float = 1.0,
    options: Dict[str, str] = {"name": "is_t_minus_n"},
):
    """
    Validate that date is `n` days before the current date (UTC)

    The target date is computed when this method is called, i.e. when the
    rule is added to the check, and is handed to `is_contained_in` as a
    single-element tuple holding the date formatted as `YYYY-MM-DD`.

    Args:
        column (str): Column name in dataframe
        value (int): The number of days before the current date
        pct (float): The threshold percentage required to pass
        options (Dict[str,str]): Options forwarded unchanged to `is_contained_in`

    Returns:
        Whatever `is_contained_in` returns
    """
    # Timezone-aware "now" in UTC: `datetime.utcnow()` is deprecated, and the
    # Check constructor already uses `datetime.now(timezone.utc)`.
    target_date = datetime.now(timezone.utc) - timedelta(days=value)
    return self.is_contained_in(
        column, (target_date.strftime("%Y-%m-%d"),), pct, options=options
    )

`is_today(column, pct=1.0)`

Validate that date is today

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_today(self, column: str, pct: float = 1.0):
    """
    Validates that a date is today

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_t_minus_n` returns
    """
    alias = {"name": "is_today"}
    return self.is_t_minus_n(column, 0, pct, options=alias)

`is_unique(column, pct=1.0, approximate=False, ignore_nulls=False)`

Validation for unique values in column

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`
`approximate`	`bool`	A flag to speed up computation using an approximation through maximum relative std. dev.	`False`
`ignore_nulls`	`bool`	Run drop nulls before counting	`False`

Source code in cuallee/__init__.py

def is_unique(
    self,
    column: str,
    pct: float = 1.0,
    approximate: bool = False,
    ignore_nulls: bool = False,
):
    """
    Validates that the values of a column are unique

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass
        approximate (bool): A flag to speed up computation using an approximation through maximum relative std. dev.
        ignore_nulls (bool): Run drop nulls before counting

    Returns:
        Check: This same instance (`self`)
    """
    # Both flags travel to the rule through its options
    flags = {"approximate": approximate, "ignore_nulls": ignore_nulls}
    uniqueness = Rule(
        "is_unique", column, "N/A", CheckDataType.AGNOSTIC, pct, options=flags
    )
    uniqueness >> self._rule
    return self

`is_yesterday(column, pct=1.0)`

Validate that date is yesterday

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def is_yesterday(self, column: str, pct: float = 1.0):
    """
    Validates that a date is yesterday

    Args:
        column (str): Column name in dataframe
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `is_t_minus_n` returns
    """
    alias = {"name": "is_yesterday"}
    return self.is_t_minus_n(column, 1, pct, options=alias)

`not_contained_in(column, value, pct=1.0)`

Validation of column value not in set of given values

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[str, number, date]`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def not_contained_in(
    self,
    column: str,
    value: Union[List, Tuple],
    pct: float = 1.0,
):
    """
    Validates that the values of a column do not belong to a set of given values

    Args:
        column (str): Column name in dataframe
        value (List[str,number,date]): The collection of rejected values
        pct (float): The threshold percentage required to pass

    Returns:
        Check: This same instance (`self`)
    """
    exclusion = Rule("not_contained_in", column, value, CheckDataType.AGNOSTIC, pct)
    exclusion >> self._rule
    return self

`not_in(column, value, pct=1.0)`

Validation of column value not in set of given values

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in dataframe	required
`value`	`List[str, number, date]`	The condition for the column to match	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def not_in(self, column: str, value: Tuple[str, int, float], pct: float = 1.0):
    """
    Validates that the values of a column do not belong to a set of given values

    Shorthand that forwards its arguments to `not_contained_in`.

    Args:
        column (str): Column name in dataframe
        value (List[str,number,date]): The collection of rejected values
        pct (float): The threshold percentage required to pass

    Returns:
        Whatever `not_contained_in` returns
    """
    delegate = self.not_contained_in
    return delegate(column, value, pct)

`ok(dataframe)`

True when all checks passed

Source code in cuallee/__init__.py

def ok(self, dataframe: Any) -> bool:
    """
    True when all checks passed

    Runs `validate` on the dataframe with its `ok` flag switched on
    and returns that result.
    """
    verdict = self.validate(dataframe, ok=True)
    return verdict

`satisfies(column, predicate, pct=1.0, options={})`

Validation of a column satisfying a SQL-like predicate

Parameters:

Name	Type	Description	Default
`column`	`str`	Column name in the dataframe	required
`predicate`	`str`	A predicate written in SQL-like syntax	required
`pct`	`float`	The threshold percentage required to pass	`1.0`

Source code in cuallee/__init__.py

def satisfies(
    self,
    column: str,
    predicate: str,
    pct: float = 1.0,
    options: Dict[str, str] = {},
):
    """
    Validates that a column satisfies a SQL-like predicate

    Args:
        column (str): Column name in the dataframe
        predicate (str): A predicate written in SQL-like syntax
        pct (float): The threshold percentage required to pass
        options (Dict[str,str]): Extra options forwarded unchanged to the rule

    Returns:
        Check: This same instance (`self`)
    """
    # The predicate travels in the position where other rules pass their `value`
    predicate_rule = Rule(
        "satisfies", column, predicate, CheckDataType.AGNOSTIC, pct, options=options
    )
    predicate_rule >> self._rule
    return self

`validate(dataframe, ok=False)`

Compute all rules in this check for specific data frame

Parameters:

Name	Type	Description	Default
`dataframe`	`Union[pyspark, snowpark, pandas, polars, duckdb, bigquery]`	A dataframe object	required

Source code in cuallee/__init__.py

def validate(self, dataframe: Any, ok: bool = False):
    """
    Compute all rules in this check for specific data frame

    The compute engine is chosen from the type name of `dataframe`: the first
    framework keyword found in that name selects `cuallee.<keyword>_validation`.

    Args:
        dataframe (Union[pyspark,snowpark,pandas,polars,duckdb,bigquery,daft]): A dataframe object
        ok (bool): When True return the engine's `ok` result instead of its `summary`

    Returns:
        The value returned by the selected engine's `ok` or `summary` function

    Raises:
        AssertionError: When the check has no rules, or when the engine reports
            that rules and dataframe have incompatible data types
        NotImplementedError: When no engine matches the dataframe type
    """

    # Stop execution if there are no rules in the check.
    # Raised explicitly instead of via `assert` so the guard survives `python -O`.
    if self.empty:
        raise AssertionError("Check is empty. Try adding some rules?")

    # str(type(x)) looks like "<class 'package.module.Name'>"; keep the quoted part
    self.dtype = re.match(r".*'(.*)'", str(type(dataframe))).group(1)

    # Order matters: the first keyword contained in the type name wins
    supported_engines = (
        "pyspark",
        "pandas",
        "snowpark",
        "polars",
        "duckdb",
        "bigquery",
        "daft",
    )
    engine_name = next(
        (keyword for keyword in supported_engines if keyword in self.dtype), None
    )
    if engine_name is None:
        raise NotImplementedError(f"{self.dtype} is not yet implemented in cuallee")
    self.compute_engine = importlib.import_module(f"cuallee.{engine_name}_validation")

    # Explicit raise: with `assert` this whole call would be skipped under `-O`
    if not self.compute_engine.validate_data_types(self.rules, dataframe):
        raise AssertionError("Invalid data types between rules and dataframe")

    if ok:
        return self.compute_engine.ok(self, dataframe)
    return self.compute_engine.summary(self, dataframe)

Check

empty property

keys property

rules property

sum property

__init__(level=0, name='cuallee.check', *, execution_date=datetime.now(timezone.utc), table_name=None, session=None)

_remove_rule_generic(key)

add_rule(method, *arg, **kwargs)

adjust_rule_coverage(rule_index, rule_coverage)

are_complete(column, pct=1.0)

are_unique(column, pct=1.0)

delete_rule_by_attribute(rule_attribute, values)

delete_rule_by_key(keys)

has_cardinality(column, value)

has_correlation(column_left, column_right, value)

has_entropy(column, value, tolerance=0.01)

has_infogain(column, pct=1.0)

has_max(column, value)

has_max_by(column_source, column_target, value)

has_mean(column, value)

has_min(column, value)

has_min_by(column_source, column_target, value)

has_pattern(column, value, pct=1.0, options={})

has_percentile(column, value, percentile, precision=10000)

has_std(column, value)

has_sum(column, value)

has_workflow(column_group, column_event, column_order, edges, pct=1.0)

is_between(column, value, pct=1.0)

is_complete(column, pct=1.0)

is_composite_key(column, pct=1.0)

is_contained_in(column, value, pct=1.0, options={})

is_custom(column, fn=None, pct=1.0, options={})

is_daily(column, value=None, pct=1.0)

is_empty(column, pct=1.0)

is_equal_than(column, value, pct=1.0)

is_greater_or_equal_than(column, value, pct=1.0)

is_greater_than(column, value, pct=1.0)

is_in(column, value, pct=1.0)

is_in_billions(column, pct=1.0)

is_in_millions(column, pct=1.0)

is_inside_interquartile_range(column, value=[0.25, 0.75], pct=1.0)

is_legit(column, pct=1.0)

is_less_or_equal_than(column, value, pct=1.0)

is_less_than(column, value, pct=1.0)

is_negative(column, pct=1.0)

is_on_friday(column, pct=1.0)

is_on_monday(column, pct=1.0)

is_on_saturday(column, pct=1.0)

is_on_schedule(column, value, pct=1.0)

is_on_sunday(column, pct=1.0)

is_on_thursday(column, pct=1.0)

is_on_tuesday(column, pct=1.0)

is_on_wednesday(column, pct=1.0)

is_on_weekday(column, pct=1.0)

is_on_weekend(column, pct=1.0)

is_positive(column, pct=1.0)

is_primary_key(column, pct=1.0)

is_t_minus_1(column, pct=1.0)

is_t_minus_2(column, pct=1.0)

is_t_minus_3(column, pct=1.0)

is_t_minus_n(column, value, pct=1.0, options={'name': 'is_t_minus_n'})

is_today(column, pct=1.0)

is_unique(column, pct=1.0, approximate=False, ignore_nulls=False)

is_yesterday(column, pct=1.0)

not_contained_in(column, value, pct=1.0)

not_in(column, value, pct=1.0)

ok(dataframe)

satisfies(column, predicate, pct=1.0, options={})

validate(dataframe, ok=False)

`empty` `property`

`keys` `property`

`rules` `property`

`sum` `property`

`__init__(level=0, name='cuallee.check', *, execution_date=datetime.now(timezone.utc), table_name=None, session=None)`

`_remove_rule_generic(key)`

`add_rule(method, *arg, **kwargs)`

`adjust_rule_coverage(rule_index, rule_coverage)`

`are_complete(column, pct=1.0)`

`are_unique(column, pct=1.0)`

`delete_rule_by_attribute(rule_attribute, values)`

`delete_rule_by_key(keys)`

`has_cardinality(column, value)`

`has_correlation(column_left, column_right, value)`

`has_entropy(column, value, tolerance=0.01)`

`has_infogain(column, pct=1.0)`

`has_max(column, value)`

`has_max_by(column_source, column_target, value)`

`has_mean(column, value)`

`has_min(column, value)`

`has_min_by(column_source, column_target, value)`

`has_pattern(column, value, pct=1.0, options={})`

`has_percentile(column, value, percentile, precision=10000)`

`has_std(column, value)`

`has_sum(column, value)`

`has_workflow(column_group, column_event, column_order, edges, pct=1.0)`

`is_between(column, value, pct=1.0)`

`is_complete(column, pct=1.0)`

`is_composite_key(column, pct=1.0)`

`is_contained_in(column, value, pct=1.0, options={})`

`is_custom(column, fn=None, pct=1.0, options={})`

`is_daily(column, value=None, pct=1.0)`

`is_empty(column, pct=1.0)`

`is_equal_than(column, value, pct=1.0)`

`is_greater_or_equal_than(column, value, pct=1.0)`

`is_greater_than(column, value, pct=1.0)`

`is_in(column, value, pct=1.0)`

`is_in_billions(column, pct=1.0)`

`is_in_millions(column, pct=1.0)`

`is_inside_interquartile_range(column, value=[0.25, 0.75], pct=1.0)`

`is_legit(column, pct=1.0)`

`is_less_or_equal_than(column, value, pct=1.0)`

`is_less_than(column, value, pct=1.0)`

`is_negative(column, pct=1.0)`

`is_on_friday(column, pct=1.0)`

`is_on_monday(column, pct=1.0)`

`is_on_saturday(column, pct=1.0)`

`is_on_schedule(column, value, pct=1.0)`

`is_on_sunday(column, pct=1.0)`

`is_on_thursday(column, pct=1.0)`

`is_on_tuesday(column, pct=1.0)`

`is_on_wednesday(column, pct=1.0)`

`is_on_weekday(column, pct=1.0)`

`is_on_weekend(column, pct=1.0)`

`is_positive(column, pct=1.0)`

`is_primary_key(column, pct=1.0)`

`is_t_minus_1(column, pct=1.0)`

`is_t_minus_2(column, pct=1.0)`

`is_t_minus_3(column, pct=1.0)`

`is_t_minus_n(column, value, pct=1.0, options={'name': 'is_t_minus_n'})`

`is_today(column, pct=1.0)`

`is_unique(column, pct=1.0, approximate=False, ignore_nulls=False)`

`is_yesterday(column, pct=1.0)`

`not_contained_in(column, value, pct=1.0)`

`not_in(column, value, pct=1.0)`

`ok(dataframe)`

`satisfies(column, predicate, pct=1.0, options={})`

`validate(dataframe, ok=False)`