-
Notifications
You must be signed in to change notification settings - Fork 4k
GH-48832: [R] Fix crash with zero-length POSIXct tzone attribute #48854
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Add safety check for empty tzone attribute in InferArrowTypeFromVector to prevent out-of-bounds access when handling zero-length POSIXct vectors.
…t/Ubuntu Added Development Tool Requirements section to cpp/development.rst documenting the required versions: - clang-format 14.0.6+ - pre-commit 2.17.0+ - Ubuntu 22.04 LTS+
jonkeane
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for submitting the change. The devtooling documentation should go in a separate PR IMO.
Would you mind moving the tests to where we test other examples of POSIXct? The approach to solve it looks solid, but CI will determine if we are good.
| test_that("zero-length POSIXct with empty tzone attribute handled safely", { | ||
| x <- as.POSIXct(character(0)) | ||
| attr(x, "tzone") <- character(0) | ||
|
|
||
| # Should not crash or error | ||
| expect_error(type(x), NA) | ||
|
|
||
| # Should default to no timezone (or empty string which effectively means local/no-tz behavior in arrow) | ||
| # When sys.timezone is picked up it might vary, but we just check it doesn't crash. | ||
| # If it picks up Sys.timezone(), checking exact equality might be flaky across environments if not mocked. | ||
| # So we primarily check for no error. | ||
|
|
||
| # Also check write_parquet survival | ||
| tf <- tempfile() | ||
| on.exit(unlink(tf)) | ||
| expect_error(write_parquet(data.frame(x = x), tf), NA) | ||
| expect_true(file.exists(tf)) | ||
| }) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would you mind putting this test in somewhere around
arrow/r/tests/testthat/test-Array.R
Lines 279 to 325 in 962d051
| test_that("array supports POSIXct (ARROW-3340)", { | |
| times <- lubridate::ymd_hms("2018-10-07 19:04:05") + 1:10 | |
| expect_array_roundtrip(times, timestamp("us", "UTC")) | |
| times[5] <- NA | |
| expect_array_roundtrip(times, timestamp("us", "UTC")) | |
| times2 <- lubridate::ymd_hms("2018-10-07 19:04:05", tz = "America/New_York") + 1:10 | |
| expect_array_roundtrip(times2, timestamp("us", "America/New_York")) | |
| }) | |
| test_that("array uses local timezone for POSIXct without timezone", { | |
| withr::with_envvar(c(TZ = ""), { | |
| times <- strptime("2019-02-03 12:34:56", format = "%Y-%m-%d %H:%M:%S") + 1:10 | |
| expect_equal(attr(times, "tzone"), NULL) | |
| expect_array_roundtrip(times, timestamp("us", Sys.timezone())) | |
| # Also test the INTSXP code path | |
| skip("Ingest_POSIXct only implemented for REALSXP") | |
| times_int <- as.integer(times) | |
| attributes(times_int) <- attributes(times) | |
| expect_array_roundtrip(times_int, timestamp("us", "")) | |
| }) | |
| # If there is a timezone set, we record that | |
| withr::with_timezone("Pacific/Marquesas", { | |
| times <- strptime("2019-02-03 12:34:56", format = "%Y-%m-%d %H:%M:%S") + 1:10 | |
| expect_equal(attr(times, "tzone"), "Pacific/Marquesas") | |
| expect_array_roundtrip(times, timestamp("us", "Pacific/Marquesas")) | |
| times_with_tz <- strptime( | |
| "2019-02-03 12:34:56", | |
| format = "%Y-%m-%d %H:%M:%S", | |
| tz = "Asia/Katmandu" | |
| ) + | |
| 1:10 | |
| expect_equal(attr(times, "tzone"), "Asia/Katmandu") | |
| expect_array_roundtrip(times, timestamp("us", "Asia/Katmandu")) | |
| }) | |
| # and although the TZ is NULL in R, we set it to the Sys.timezone() | |
| withr::with_timezone(NA, { | |
| times <- strptime("2019-02-03 12:34:56", format = "%Y-%m-%d %H:%M:%S") + 1:10 | |
| expect_equal(attr(times, "tzone"), NULL) | |
| expect_array_roundtrip(times, timestamp("us", Sys.timezone())) | |
| }) | |
| }) |
and possibly for the parquet section, put it somewhere near
arrow/r/tests/testthat/test-parquet.R
Lines 119 to 153 in 962d051
| test_that("write_parquet() can truncate timestamps", { | |
| tab <- Table$create(x1 = as.POSIXct("2020/06/03 18:00:00", tz = "UTC")) | |
| expect_type_equal(tab$x1, timestamp("us", "UTC")) | |
| tf <- tempfile() | |
| on.exit(unlink(tf)) | |
| write_parquet(tab, tf, coerce_timestamps = "ms", allow_truncated_timestamps = TRUE) | |
| new <- read_parquet(tf, as_data_frame = FALSE) | |
| expect_type_equal(new$x1, timestamp("ms", "UTC")) | |
| expect_equal(as.data.frame(tab), as.data.frame(new)) | |
| }) | |
| test_that("make_valid_parquet_version()", { | |
| expect_equal( | |
| make_valid_parquet_version("1.0"), | |
| ParquetVersionType$PARQUET_1_0 | |
| ) | |
| expect_equal( | |
| make_valid_parquet_version("2.4"), | |
| ParquetVersionType$PARQUET_2_4 | |
| ) | |
| expect_equal( | |
| make_valid_parquet_version("2.6"), | |
| ParquetVersionType$PARQUET_2_6 | |
| ) | |
| expect_equal( | |
| make_valid_parquet_version("latest"), | |
| ParquetVersionType$PARQUET_2_6 | |
| ) | |
| expect_equal(make_valid_parquet_version(1), ParquetVersionType$PARQUET_1_0) | |
| expect_equal(make_valid_parquet_version(1.0), ParquetVersionType$PARQUET_1_0) | |
| expect_equal(make_valid_parquet_version(2.4), ParquetVersionType$PARQUET_2_4) | |
| }) |
Rationale for this change
Fixes crash when writing zero-length POSIXct vectors with empty tzone attributes to Parquet in R 4.5.2.
What changes are included in this PR?
Are these changes tested?
Yes, new test case reproduces the issue and verifies the fix.
Closes #48832