From 38322b3a38d45005804721d4fe4ce9796470c2d4 Mon Sep 17 00:00:00 2001 From: "Adisa Mubarak (AdMub)" <99817240+AdMub@users.noreply.github.com> Date: Thu, 22 Jan 2026 06:14:27 +0000 Subject: [PATCH 1/2] docs: Add warning to first_value about usage in select vs aggregate Clarifies that aggregate functions like first_value must be used within .aggregate() and not .select(). Closes #1300. --- python/datafusion/functions.py | 5 ++++ reproduce_1300.py | 45 ++++++++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 reproduce_1300.py diff --git a/python/datafusion/functions.py b/python/datafusion/functions.py index 7ae59c000..89645104a 100644 --- a/python/datafusion/functions.py +++ b/python/datafusion/functions.py @@ -2268,6 +2268,11 @@ def first_value( ) -> Expr: """Returns the first value in a group of values. + .. note:: + This is an aggregate function. It must be used within + :py:meth:`~datafusion.dataframe.DataFrame.aggregate` and generally cannot be + used with :py:meth:`~datafusion.dataframe.DataFrame.select`. + This aggregate function will return the first value in the partition. If using the builder functions described in ref:`_aggregation` this function ignores diff --git a/reproduce_1300.py b/reproduce_1300.py new file mode 100644 index 000000000..55285aa2b --- /dev/null +++ b/reproduce_1300.py @@ -0,0 +1,45 @@ +import datafusion as dfn +from datafusion import lit, col, functions as F +from datafusion.expr import Window, WindowFrame + +def main() -> None: + # Create the context and data + ctx = dfn.SessionContext() + df = ctx.from_pydict( + {"any_row": list(range(10))}, + ) + + # Add a column of ones + df = df.select( + "any_row", + lit(1).alias("ones"), + ) + + # Perform Window functions + df = df.select( + "any_row", + F.sum(col("ones"))\ + .over(Window(window_frame=WindowFrame("rows", None, 0), order_by=col("any_row").sort(ascending=True))) \ + .alias("forward_row_sum"), + F.sum(col("ones"))\ + .over(Window(window_frame=WindowFrame("rows", None, 0), order_by=col("any_row").sort(ascending=False))) \ + .alias("reverse_row_sum"), + ) + + # Collect the intermediate window results (this should work) + print("Collecting Window Results...") + df.collect() + + # THIS IS THE FIX TEST + print("Attempting to use .aggregate() instead of .select() ...") + + # We use an empty list [] for group_by, and the function for the aggregate + df.aggregate( + [], + [F.first_value(col("forward_row_sum"), order_by=col("any_row"))] + ).collect() + + print("Success! .aggregate() worked.") + +if __name__ == "__main__": + main() \ No newline at end of file From d7b3ab504f4e9f3d4d54039df9a458a54a598ecd Mon Sep 17 00:00:00 2001 From: "Adisa Mubarak (AdMub)" <99817240+AdMub@users.noreply.github.com> Date: Thu, 22 Jan 2026 06:16:33 +0000 Subject: [PATCH 2/2] chore: remove temporary reproduction script --- reproduce_1300.py | 45 --------------------------------------------- 1 file changed, 45 deletions(-) delete mode 100644 reproduce_1300.py diff --git a/reproduce_1300.py b/reproduce_1300.py deleted file mode 100644 index 55285aa2b..000000000 --- a/reproduce_1300.py +++ /dev/null @@ -1,45 +0,0 @@ -import datafusion as dfn -from datafusion import lit, col, functions as F -from datafusion.expr import Window, WindowFrame - -def main() -> None: - # Create the context and data - ctx = dfn.SessionContext() - df = ctx.from_pydict( - {"any_row": list(range(10))}, - ) - - # Add a column of ones - df = df.select( - "any_row", - lit(1).alias("ones"), - ) - - # Perform Window functions - df = df.select( - "any_row", - F.sum(col("ones"))\ - .over(Window(window_frame=WindowFrame("rows", None, 0), order_by=col("any_row").sort(ascending=True))) \ - .alias("forward_row_sum"), - F.sum(col("ones"))\ - .over(Window(window_frame=WindowFrame("rows", None, 0), order_by=col("any_row").sort(ascending=False))) \ - .alias("reverse_row_sum"), - ) - - # Collect the intermediate window results (this should work) - print("Collecting Window Results...") - df.collect() - - # THIS IS THE FIX TEST - print("Attempting to use .aggregate() instead of .select() ...") - - # We use an empty list [] for group_by, and the function for the aggregate - df.aggregate( - [], - [F.first_value(col("forward_row_sum"), order_by=col("any_row"))] - ).collect() - - print("Success! .aggregate() worked.") - -if __name__ == "__main__": - main() \ No newline at end of file