Polars Custom Expression Namespace

Today I explored how to register a custom expression namespace in Polars. This feature turned out to be super helpful for solving a common problem I run into when building tables or plots—colorizing rows based on their row index.

Here is the code snippet:

Show full code

from typing import Any

import polars as pl
from great_tables import GT, loc, style


@pl.api.register_expr_namespace("spt")
class DiscreteSplitter:
    """Expression namespace ``spt``: label rows by their position modulo ``n``.

    The wrapped expression is only a handle for reaching the namespace. Every
    method builds its result from the zero-based row position alone, so any
    column (even the placeholder ``pl.col("")``) can be used to call it.
    """

    def __init__(self, expr: pl.Expr) -> None:
        # Stored because the namespace is constructed with the expression it
        # was accessed on; none of the methods below read it.
        self._expr = expr

    def _mod_expr(self, n: int) -> pl.Expr:
        """Return each row's zero-based position modulo ``n``."""
        return pl.int_range(pl.len(), dtype=pl.UInt32).mod(n)

    def binarize(
        self, lit1: str, lit2: str, name: str = "binarized"
    ) -> pl.Expr:
        """Alternate between ``lit1`` and ``lit2`` row by row.

        Rows at even zero-based positions get ``lit1``; all others get
        ``lit2``. The resulting expression is aliased to ``name``.
        """
        mod_expr = self._mod_expr(2)
        return (
            pl.when(mod_expr.eq(0))
            .then(pl.lit(lit1))
            .otherwise(pl.lit(lit2))
            .alias(name)
        )

    def trinarize(
        self, lit1: str, lit2: str, lit3: str, name: str = "trinarized"
    ) -> pl.Expr:
        """Cycle through ``lit1``, ``lit2``, ``lit3`` row by row.

        The zero-based row position modulo 3 selects the literal (0 ->
        ``lit1``, 1 -> ``lit2``, otherwise ``lit3``). The resulting
        expression is aliased to ``name``.
        """
        mod_expr = self._mod_expr(3)
        return (
            pl.when(mod_expr.eq(0))
            .then(pl.lit(lit1))
            .when(mod_expr.eq(1))
            .then(pl.lit(lit2))
            .otherwise(pl.lit(lit3))
            .alias(name)
        )

    def bucketize(
        self, lits: list[Any], name: str = "bucketized"
    ) -> pl.Expr:
        """Cycle through the values of ``lits`` row by row.

        The zero-based row position modulo ``len(lits)`` selects the value.
        The resulting expression is aliased to ``name``.

        Raises:
            ValueError: If ``lits`` is empty (there is nothing to assign and
                the modulus would be zero).
        """
        if len(lits) == 0:
            raise ValueError("`lits` must contain at least one value")

        mod_expr = self._mod_expr(len(lits))

        # first
        expr = pl.when(mod_expr.eq(0)).then(pl.lit(lits[0]))

        # middles
        for i, one_lit in enumerate(lits[1:-1], start=1):
            expr = expr.when(mod_expr.eq(i)).then(pl.lit(one_lit))

        # last: the only remainder left is len(lits) - 1, so it can be the
        # catch-all branch
        expr = expr.otherwise(pl.lit(lits[-1]))
        return expr.alias(name)


# Sample frame: nine values plus a 1-based ``index`` column for display.
df = (
    pl.DataFrame({"n": [100, 50, 72, 83, 97, 42, 20, 51, 77]})
    .with_row_index(offset=1)
    .with_columns(
        # pl.col("") is only a handle for reaching the ``spt`` namespace;
        # the methods ignore the column it refers to.
        pl.col("").spt.binarize("lightblue", "papayawhip"),
        pl.col("").spt.trinarize("one", "two", "three"),
        pl.col("").spt.bucketize([1, 2, 3, 4]),
    )
)

# Use the color names stored in ``binarized`` as the fill for the body cells.
(
    GT(df)
    .tab_style(style=style.fill(pl.col("binarized")), locations=loc.body())
    .opt_stylize(style=6)
)

Step-by-Step Breakdown

Registering the Namespace

We use pl.api.register_expr_namespace() to attach our class to the spt namespace. Once registered, we can call our methods like this: pl.col("any").spt.binarize(...).

@pl.api.register_expr_namespace("spt")
class DiscreteSplitter:
    def __init__(self, expr: pl.Expr) -> None:
        # The namespace is constructed with the expression it was accessed on.
        self._expr = expr

Helper: Row-Index Based Modulo Expression

To assign values based on row position, we need a way to refer to the row index inside an expression. Polars offers an idiom for this, pl.int_range(pl.len()), which is also mentioned in the documentation of pl.DataFrame.with_row_index().

@pl.api.register_expr_namespace("spt")
class DiscreteSplitter:
    def _mod_expr(self, n: int) -> pl.Expr:
        """Return each row's zero-based position modulo ``n``."""
        return pl.int_range(pl.len(), dtype=pl.UInt32).mod(n)

`binarize`: Two Groups

This method maps alternating rows into two categories using pl.when().then().otherwise().

@pl.api.register_expr_namespace("spt")
class DiscreteSplitter:
    def binarize(
        self, lit1: str, lit2: str, name: str = "binarized"
    ) -> pl.Expr:
        """Alternate between ``lit1`` and ``lit2`` row by row.

        Rows at even zero-based positions get ``lit1``; all others get
        ``lit2``. The resulting expression is aliased to ``name``.
        """
        mod_expr = self._mod_expr(2)
        return (
            pl.when(mod_expr.eq(0))
            .then(pl.lit(lit1))
            .otherwise(pl.lit(lit2))
            .alias(name)
        )

`trinarize`: Three Groups

Similar to binarize, but splits the rows into three groups using two when().then() branches before the final otherwise().

@pl.api.register_expr_namespace("spt")
class DiscreteSplitter:
    def trinarize(
        self, lit1: str, lit2: str, lit3: str, name: str = "trinarized"
    ) -> pl.Expr:
        """Cycle through ``lit1``, ``lit2``, ``lit3`` row by row.

        The zero-based row position modulo 3 selects the literal (0 ->
        ``lit1``, 1 -> ``lit2``, otherwise ``lit3``). The resulting
        expression is aliased to ``name``.
        """
        mod_expr = self._mod_expr(3)
        return (
            pl.when(mod_expr.eq(0))
            .then(pl.lit(lit1))
            .when(mod_expr.eq(1))
            .then(pl.lit(lit2))
            .otherwise(pl.lit(lit3))
            .alias(name)
        )

`bucketize`: N Groups

A generalized version of the above, which dynamically assigns values from a list across n groups:

@pl.api.register_expr_namespace("spt")
class DiscreteSplitter:
    def bucketize(
        self, lits: list[Any], name: str = "bucketized"
    ) -> pl.Expr:
        """Cycle through the values of ``lits`` row by row.

        The zero-based row position modulo ``len(lits)`` selects the value.
        The resulting expression is aliased to ``name``.

        Raises:
            ValueError: If ``lits`` is empty (there is nothing to assign and
                the modulus would be zero).
        """
        if len(lits) == 0:
            raise ValueError("`lits` must contain at least one value")

        mod_expr = self._mod_expr(len(lits))

        # first
        expr = pl.when(mod_expr.eq(0)).then(pl.lit(lits[0]))

        # middles
        for i, one_lit in enumerate(lits[1:-1], start=1):
            expr = expr.when(mod_expr.eq(i)).then(pl.lit(one_lit))

        # last: the only remainder left is len(lits) - 1, so it can be the
        # catch-all branch
        expr = expr.otherwise(pl.lit(lits[-1]))
        return expr.alias(name)

Example Usage

Here’s a simple example that demonstrates how the custom namespace works in practice:

# Sample frame: nine values plus a 1-based ``index`` column for display.
df = (
    pl.DataFrame({"n": [100, 50, 72, 83, 97, 42, 20, 51, 77]})
    .with_row_index(offset=1)
    .with_columns(
        # pl.col("") is only a handle for reaching the ``spt`` namespace.
        pl.col("").spt.binarize("lightblue", "papayawhip"),
        pl.col("").spt.trinarize("one", "two", "three"),
        pl.col("").spt.bucketize([1, 2, 3, 4]),
    )
)

This produces the following DataFrame:

shape: (9, 5)
┌───────┬─────┬────────────┬────────────┬────────────┐
│ index ┆ n   ┆ binarized  ┆ trinarized ┆ bucketized │
│ ---   ┆ --- ┆ ---        ┆ ---        ┆ ---        │
│ u32   ┆ i64 ┆ str        ┆ str        ┆ i32        │
╞═══════╪═════╪════════════╪════════════╪════════════╡
│ 1     ┆ 100 ┆ lightblue  ┆ one        ┆ 1          │
│ 2     ┆ 50  ┆ papayawhip ┆ two        ┆ 2          │
│ 3     ┆ 72  ┆ lightblue  ┆ three      ┆ 3          │
│ 4     ┆ 83  ┆ papayawhip ┆ one        ┆ 4          │
│ 5     ┆ 97  ┆ lightblue  ┆ two        ┆ 1          │
│ 6     ┆ 42  ┆ papayawhip ┆ three      ┆ 2          │
│ 7     ┆ 20  ┆ lightblue  ┆ one        ┆ 3          │
│ 8     ┆ 51  ┆ papayawhip ┆ two        ┆ 4          │
│ 9     ┆ 77  ┆ lightblue  ┆ three      ┆ 1          │
└───────┴─────┴────────────┴────────────┴────────────┘

Note: Since the custom logic is based on the row index rather than actual column values, you can safely use pl.col("") as a placeholder when calling the namespace methods.

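Each new column shows how rows are grouped using the zero-based row position modulo 2, 3, or 4 (the position is counted from 0, independently of the 1-based index column shown above)—useful for highlighting patterns or applying styling.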

For instance, you can use the binarized column with Great Tables like this:

# Use the color names stored in ``binarized`` as the fill for the body cells.
(
    GT(df)
    .tab_style(style=style.fill(pl.col("binarized")), locations=loc.body())
    .opt_stylize(style=6)
)

Conclusion

Registering a custom expression namespace in Polars is a powerful way to encapsulate and reuse logic across your codebase. In this post, we created a DiscreteSplitter class to simplify index-based grouping, enabling operations like binarize, trinarize, and bucketize. This approach keeps your expressions clean and composable, especially when generating tables or plots that require styling based on row position.

It’s also worth noting that Polars supports similar registration for Series, LazyFrame, and DataFrame objects—check out the official documentation for more details.

Remark

Here’s a rough draft showing how to achieve a similar effect using the DataFrame namespace. I might revisit and refine this approach in the future.

Show full code

from typing import Any

import polars as pl
from great_tables import GT, loc, style


@pl.api.register_dataframe_namespace("spt")
class DiscreteSplitter:
    """DataFrame namespace ``spt``: append a column labelling rows by position.

    Each method returns a new DataFrame with one extra column whose value
    cycles through the given literals according to the zero-based row
    position. The position is computed inside the expression, so no helper
    columns are added to or dropped from the frame: existing columns named
    ``index`` or ``mod`` are left untouched, and a result column may use
    either of those names.
    """

    def __init__(self, df: pl.DataFrame) -> None:
        self._df = df

    def _mod_expr(self, n: int) -> pl.Expr:
        """Return each row's zero-based position modulo ``n``."""
        return pl.int_range(pl.len(), dtype=pl.UInt32).mod(n)

    def binarize(
        self, lit1: str, lit2: str, name: str = "binarized"
    ) -> pl.DataFrame:
        """Return the frame plus a column alternating ``lit1`` / ``lit2``.

        Rows at even zero-based positions get ``lit1``; all others get
        ``lit2``. The new column is called ``name``.
        """
        return self.bucketize([lit1, lit2], name=name)

    def trinarize(
        self, lit1: str, lit2: str, lit3: str, name: str = "trinarized"
    ) -> pl.DataFrame:
        """Return the frame plus a column cycling ``lit1``, ``lit2``, ``lit3``.

        The zero-based row position modulo 3 selects the literal. The new
        column is called ``name``.
        """
        return self.bucketize([lit1, lit2, lit3], name=name)

    def bucketize(
        self, lits: list[Any], name: str = "bucketized"
    ) -> pl.DataFrame:
        """Return the frame plus a column cycling through ``lits``.

        The zero-based row position modulo ``len(lits)`` selects the value.
        The new column is called ``name``.

        Raises:
            ValueError: If ``lits`` is empty (there is nothing to assign and
                the modulus would be zero).
        """
        if len(lits) == 0:
            raise ValueError("`lits` must contain at least one value")

        mod_expr = self._mod_expr(len(lits))

        # first
        expr = pl.when(mod_expr.eq(0)).then(pl.lit(lits[0]))

        # middles
        for i, one_lit in enumerate(lits[1:-1], start=1):
            expr = expr.when(mod_expr.eq(i)).then(pl.lit(one_lit))

        # last: the only remainder left is len(lits) - 1, so it can be the
        # catch-all branch
        expr = expr.otherwise(pl.lit(lits[-1]))

        return self._df.with_columns(expr.alias(name))


# Each ``spt`` call returns a new DataFrame, so the calls chain directly.
df = (
    pl.DataFrame({"n": [100, 50, 72, 83, 97, 42, 20, 51, 77]})
    .spt.binarize("lightblue", "papayawhip")
    .spt.trinarize("one", "two", "three")
    .spt.bucketize([1, 2, 3, 4])
    # The 1-based display index is added last, after the splitter calls.
    .with_row_index(offset=1)
)

# Use the color names stored in ``binarized`` as the fill for the body cells.
(
    GT(df)
    .tab_style(style=style.fill(pl.col("binarized")), locations=loc.body())
    .opt_stylize(style=6)
)

Disclaimer

This post was drafted by me, with AI assistance to refine the content.

Step-by-Step Breakdown

Registering the Namespace

Helper: Row-Index Based Modulo Expression

binarize: Two Groups

trinarize: Three Groups

bucketize: N Groups

Example Usage

Conclusion

Remark

`binarize`: Two Groups

`trinarize`: Three Groups

`bucketize`: N Groups