From d1dc4bdb2fe7f16e6da78c0930353e4a5031465a Mon Sep 17 00:00:00 2001 From: Earlopain <14981592+Earlopain@users.noreply.github.com> Date: Sat, 17 Jan 2026 20:28:01 +0100 Subject: [PATCH 1/6] [ruby/prism] Fix ripper translator for `__END__` https://github.com/ruby/prism/commit/2792ac78ca --- lib/prism/lex_compat.rb | 13 ++++--------- test/prism/fixtures/__END__.txt | 3 +++ 2 files changed, 7 insertions(+), 9 deletions(-) create mode 100644 test/prism/fixtures/__END__.txt diff --git a/lib/prism/lex_compat.rb b/lib/prism/lex_compat.rb index 46f6130357e024..b7c54178ac3193 100644 --- a/lib/prism/lex_compat.rb +++ b/lib/prism/lex_compat.rb @@ -225,14 +225,6 @@ def state end end - # Ripper doesn't include the rest of the token in the event, so we need to - # trim it down to just the content on the first line when comparing. - class EndContentToken < Token - def ==(other) # :nodoc: - [self[0], self[1], self[2][0..self[2].index("\n")], self[3]] == other - end - end - # Tokens where state should be ignored # used for :on_comment, :on_heredoc_end, :on_embexpr_end class IgnoreStateToken < Token @@ -680,7 +672,10 @@ def result token = case event when :on___end__ - EndContentToken.new([[lineno, column], event, value, lex_state]) + # Ripper doesn't include the rest of the token in the event, so we need to + # trim it down to just the content on the first line. + value = value[0..value.index("\n")] + Token.new([[lineno, column], event, value, lex_state]) when :on_comment IgnoreStateToken.new([[lineno, column], event, value, lex_state]) when :on_heredoc_end diff --git a/test/prism/fixtures/__END__.txt b/test/prism/fixtures/__END__.txt new file mode 100644 index 00000000000000..c0f4f28004cdc5 --- /dev/null +++ b/test/prism/fixtures/__END__.txt @@ -0,0 +1,3 @@ +foo +__END__ +Available in DATA constant From 6cd4549060a608d8a7e5ee0dde2c4b69b08d7f6e Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 18 Jan 2026 10:33:54 +0100 Subject: [PATCH 2/6] Optimize File.join common use case `File.join` is a hotspot for common libraries such as Zeitwerk and Bootsnap. It has a fairly flexible signature, but 99% of the time it's called with just two (or a small number of) UTF-8 strings. If we optimistically optimize for that use case we can cut down a large number of type and encoding checks, significantly speeding up the method. The one remaining expensive check we could try to optimize is `str_null_check`. Given it's common to use the same base string for joining, we could memoize it. Also we could precompute it for literal strings. ``` compare-ruby: ruby 4.1.0dev (2026-01-17T14:40:03Z master 00a3b71eaf) +PRISM [arm64-darwin25] built-ruby: ruby 4.1.0dev (2026-01-18T12:10:38Z spedup-file-join 069bab58d4) +PRISM [arm64-darwin25] warming up.... | |compare-ruby|built-ruby| |:-------------|-----------:|---------:| |two_strings | 2.475M| 9.444M| | | -| 3.82x| |many_strings | 551.975k| 2.346M| | | -| 4.25x| |array | 514.946k| 522.034k| | | -| 1.01x| |mixed | 621.236k| 633.189k| | | -| 1.02x| ``` --- benchmark/file_join.yml | 7 +++ depend | 41 +++++++++++++++++ ext/-test-/stack/depend | 1 + ext/-test-/string/depend | 3 ++ ext/objspace/depend | 1 + ext/ripper/depend | 1 + ext/socket/depend | 15 ++++++ file.c | 99 ++++++++++++++++++++++++++++++++++++---- internal/string.h | 21 +++++++++ string.c | 47 +++++++------------ 10 files changed, 197 insertions(+), 39 deletions(-) create mode 100644 benchmark/file_join.yml diff --git a/benchmark/file_join.yml b/benchmark/file_join.yml new file mode 100644 index 00000000000000..845257cf1e4a31 --- /dev/null +++ b/benchmark/file_join.yml @@ -0,0 +1,7 @@ +prelude: | + # frozen_string_literal: true +benchmark: + two_strings: File.join(__FILE__, "path") + many_strings: File.join(__FILE__, "path", "a", "b", "c", "d") + array: File.join([__FILE__, "path", "a", "b", "c", "d"]) + mixed: File.join(__FILE__, "path", "a", "b", ["c", "d"]) diff --git a/depend b/depend index cfafc77703b9fa..7372902666b97b 100644 --- a/depend +++ b/depend @@ -799,6 +799,7 @@ box.$(OBJEXT): {$(VPATH)}constant.h box.$(OBJEXT): {$(VPATH)}darray.h box.$(OBJEXT): {$(VPATH)}debug_counter.h box.$(OBJEXT): {$(VPATH)}defines.h +box.$(OBJEXT): {$(VPATH)}encindex.h box.$(OBJEXT): {$(VPATH)}encoding.h box.$(OBJEXT): {$(VPATH)}eval_intern.h box.$(OBJEXT): {$(VPATH)}id.h @@ -1250,6 +1251,7 @@ class.$(OBJEXT): {$(VPATH)}config.h class.$(OBJEXT): {$(VPATH)}constant.h class.$(OBJEXT): {$(VPATH)}debug_counter.h class.$(OBJEXT): {$(VPATH)}defines.h +class.$(OBJEXT): {$(VPATH)}encindex.h class.$(OBJEXT): {$(VPATH)}encoding.h class.$(OBJEXT): {$(VPATH)}id.h class.$(OBJEXT): {$(VPATH)}id_table.h @@ -1449,6 +1451,7 @@ compar.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h compar.$(OBJEXT): {$(VPATH)}compar.c compar.$(OBJEXT): {$(VPATH)}config.h compar.$(OBJEXT): {$(VPATH)}defines.h +compar.$(OBJEXT): {$(VPATH)}encindex.h compar.$(OBJEXT): {$(VPATH)}encoding.h compar.$(OBJEXT): {$(VPATH)}id.h compar.$(OBJEXT): {$(VPATH)}intern.h @@ -1921,6 +1924,7 @@ complex.$(OBJEXT): {$(VPATH)}config.h complex.$(OBJEXT): {$(VPATH)}constant.h complex.$(OBJEXT): {$(VPATH)}debug_counter.h complex.$(OBJEXT): {$(VPATH)}defines.h +complex.$(OBJEXT): {$(VPATH)}encindex.h complex.$(OBJEXT): {$(VPATH)}encoding.h complex.$(OBJEXT): {$(VPATH)}id.h complex.$(OBJEXT): {$(VPATH)}id_table.h @@ -2126,6 +2130,7 @@ concurrent_set.$(OBJEXT): {$(VPATH)}concurrent_set.c concurrent_set.$(OBJEXT): {$(VPATH)}config.h concurrent_set.$(OBJEXT): {$(VPATH)}debug_counter.h concurrent_set.$(OBJEXT): {$(VPATH)}defines.h +concurrent_set.$(OBJEXT): {$(VPATH)}encindex.h concurrent_set.$(OBJEXT): {$(VPATH)}encoding.h concurrent_set.$(OBJEXT): {$(VPATH)}id.h concurrent_set.$(OBJEXT): {$(VPATH)}id_table.h @@ -2364,6 +2369,7 @@ cont.$(OBJEXT): {$(VPATH)}constant.h cont.$(OBJEXT): {$(VPATH)}cont.c cont.$(OBJEXT): {$(VPATH)}debug_counter.h cont.$(OBJEXT): {$(VPATH)}defines.h +cont.$(OBJEXT): {$(VPATH)}encindex.h cont.$(OBJEXT): {$(VPATH)}encoding.h cont.$(OBJEXT): {$(VPATH)}eval_intern.h cont.$(OBJEXT): {$(VPATH)}fiber/scheduler.h @@ -4906,6 +4912,7 @@ enumerator.$(OBJEXT): {$(VPATH)}config.h enumerator.$(OBJEXT): {$(VPATH)}constant.h enumerator.$(OBJEXT): {$(VPATH)}debug_counter.h enumerator.$(OBJEXT): {$(VPATH)}defines.h +enumerator.$(OBJEXT): {$(VPATH)}encindex.h enumerator.$(OBJEXT): {$(VPATH)}encoding.h enumerator.$(OBJEXT): {$(VPATH)}enumerator.c enumerator.$(OBJEXT): {$(VPATH)}id.h @@ -5126,6 +5133,7 @@ error.$(OBJEXT): {$(VPATH)}config.h error.$(OBJEXT): {$(VPATH)}constant.h error.$(OBJEXT): {$(VPATH)}debug_counter.h error.$(OBJEXT): {$(VPATH)}defines.h +error.$(OBJEXT): {$(VPATH)}encindex.h error.$(OBJEXT): {$(VPATH)}encoding.h error.$(OBJEXT): {$(VPATH)}error.c error.$(OBJEXT): {$(VPATH)}id.h @@ -5373,6 +5381,7 @@ eval.$(OBJEXT): {$(VPATH)}config.h eval.$(OBJEXT): {$(VPATH)}constant.h eval.$(OBJEXT): {$(VPATH)}debug_counter.h eval.$(OBJEXT): {$(VPATH)}defines.h +eval.$(OBJEXT): {$(VPATH)}encindex.h eval.$(OBJEXT): {$(VPATH)}encoding.h eval.$(OBJEXT): {$(VPATH)}eval.c eval.$(OBJEXT): {$(VPATH)}eval_error.c @@ -5584,6 +5593,7 @@ file.$(OBJEXT): $(top_srcdir)/internal/array.h file.$(OBJEXT): $(top_srcdir)/internal/class.h file.$(OBJEXT): $(top_srcdir)/internal/compilers.h file.$(OBJEXT): $(top_srcdir)/internal/dir.h +file.$(OBJEXT): $(top_srcdir)/internal/encoding.h file.$(OBJEXT): $(top_srcdir)/internal/error.h file.$(OBJEXT): $(top_srcdir)/internal/file.h file.$(OBJEXT): $(top_srcdir)/internal/gc.h @@ -5865,6 +5875,7 @@ gc.$(OBJEXT): {$(VPATH)}darray.h gc.$(OBJEXT): {$(VPATH)}debug.h gc.$(OBJEXT): {$(VPATH)}debug_counter.h gc.$(OBJEXT): {$(VPATH)}defines.h +gc.$(OBJEXT): {$(VPATH)}encindex.h gc.$(OBJEXT): {$(VPATH)}encoding.h gc.$(OBJEXT): {$(VPATH)}eval_intern.h gc.$(OBJEXT): {$(VPATH)}gc.c @@ -6373,6 +6384,7 @@ hash.$(OBJEXT): {$(VPATH)}config.h hash.$(OBJEXT): {$(VPATH)}constant.h hash.$(OBJEXT): {$(VPATH)}debug_counter.h hash.$(OBJEXT): {$(VPATH)}defines.h +hash.$(OBJEXT): {$(VPATH)}encindex.h hash.$(OBJEXT): {$(VPATH)}encoding.h hash.$(OBJEXT): {$(VPATH)}hash.c hash.$(OBJEXT): {$(VPATH)}hash.rbinc @@ -7203,6 +7215,7 @@ io_buffer.$(OBJEXT): {$(VPATH)}backward/2/stdalign.h io_buffer.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h io_buffer.$(OBJEXT): {$(VPATH)}config.h io_buffer.$(OBJEXT): {$(VPATH)}defines.h +io_buffer.$(OBJEXT): {$(VPATH)}encindex.h io_buffer.$(OBJEXT): {$(VPATH)}encoding.h io_buffer.$(OBJEXT): {$(VPATH)}fiber/scheduler.h io_buffer.$(OBJEXT): {$(VPATH)}id.h @@ -7454,6 +7467,7 @@ iseq.$(OBJEXT): {$(VPATH)}config.h iseq.$(OBJEXT): {$(VPATH)}constant.h iseq.$(OBJEXT): {$(VPATH)}debug_counter.h iseq.$(OBJEXT): {$(VPATH)}defines.h +iseq.$(OBJEXT): {$(VPATH)}encindex.h iseq.$(OBJEXT): {$(VPATH)}encoding.h iseq.$(OBJEXT): {$(VPATH)}eval_intern.h iseq.$(OBJEXT): {$(VPATH)}id.h @@ -7702,6 +7716,7 @@ jit.$(OBJEXT): {$(VPATH)}config.h jit.$(OBJEXT): {$(VPATH)}constant.h jit.$(OBJEXT): {$(VPATH)}debug_counter.h jit.$(OBJEXT): {$(VPATH)}defines.h +jit.$(OBJEXT): {$(VPATH)}encindex.h jit.$(OBJEXT): {$(VPATH)}encoding.h jit.$(OBJEXT): {$(VPATH)}id.h jit.$(OBJEXT): {$(VPATH)}id_table.h @@ -7956,6 +7971,7 @@ load.$(OBJEXT): {$(VPATH)}constant.h load.$(OBJEXT): {$(VPATH)}darray.h load.$(OBJEXT): {$(VPATH)}defines.h load.$(OBJEXT): {$(VPATH)}dln.h +load.$(OBJEXT): {$(VPATH)}encindex.h load.$(OBJEXT): {$(VPATH)}encoding.h load.$(OBJEXT): {$(VPATH)}eval_intern.h load.$(OBJEXT): {$(VPATH)}id.h @@ -9979,6 +9995,7 @@ numeric.$(OBJEXT): {$(VPATH)}builtin.h numeric.$(OBJEXT): {$(VPATH)}config.h numeric.$(OBJEXT): {$(VPATH)}constant.h numeric.$(OBJEXT): {$(VPATH)}defines.h +numeric.$(OBJEXT): {$(VPATH)}encindex.h numeric.$(OBJEXT): {$(VPATH)}encoding.h numeric.$(OBJEXT): {$(VPATH)}id.h numeric.$(OBJEXT): {$(VPATH)}id_table.h @@ -10200,6 +10217,7 @@ object.$(OBJEXT): {$(VPATH)}config.h object.$(OBJEXT): {$(VPATH)}constant.h object.$(OBJEXT): {$(VPATH)}debug_counter.h object.$(OBJEXT): {$(VPATH)}defines.h +object.$(OBJEXT): {$(VPATH)}encindex.h object.$(OBJEXT): {$(VPATH)}encoding.h object.$(OBJEXT): {$(VPATH)}id.h object.$(OBJEXT): {$(VPATH)}id_table.h @@ -10418,6 +10436,7 @@ pack.$(OBJEXT): {$(VPATH)}builtin.h pack.$(OBJEXT): {$(VPATH)}config.h pack.$(OBJEXT): {$(VPATH)}constant.h pack.$(OBJEXT): {$(VPATH)}defines.h +pack.$(OBJEXT): {$(VPATH)}encindex.h pack.$(OBJEXT): {$(VPATH)}encoding.h pack.$(OBJEXT): {$(VPATH)}id.h pack.$(OBJEXT): {$(VPATH)}id_table.h @@ -10644,6 +10663,7 @@ parse.$(OBJEXT): {$(VPATH)}config.h parse.$(OBJEXT): {$(VPATH)}constant.h parse.$(OBJEXT): {$(VPATH)}defines.h parse.$(OBJEXT): {$(VPATH)}defs/keywords +parse.$(OBJEXT): {$(VPATH)}encindex.h parse.$(OBJEXT): {$(VPATH)}encoding.h parse.$(OBJEXT): {$(VPATH)}id.h parse.$(OBJEXT): {$(VPATH)}id_table.h @@ -12125,6 +12145,7 @@ proc.$(OBJEXT): {$(VPATH)}config.h proc.$(OBJEXT): {$(VPATH)}constant.h proc.$(OBJEXT): {$(VPATH)}debug_counter.h proc.$(OBJEXT): {$(VPATH)}defines.h +proc.$(OBJEXT): {$(VPATH)}encindex.h proc.$(OBJEXT): {$(VPATH)}encoding.h proc.$(OBJEXT): {$(VPATH)}eval_intern.h proc.$(OBJEXT): {$(VPATH)}id.h @@ -12356,6 +12377,7 @@ process.$(OBJEXT): {$(VPATH)}constant.h process.$(OBJEXT): {$(VPATH)}debug_counter.h process.$(OBJEXT): {$(VPATH)}defines.h process.$(OBJEXT): {$(VPATH)}dln.h +process.$(OBJEXT): {$(VPATH)}encindex.h process.$(OBJEXT): {$(VPATH)}encoding.h process.$(OBJEXT): {$(VPATH)}fiber/scheduler.h process.$(OBJEXT): {$(VPATH)}hrtime.h @@ -12585,6 +12607,7 @@ ractor.$(OBJEXT): {$(VPATH)}config.h ractor.$(OBJEXT): {$(VPATH)}constant.h ractor.$(OBJEXT): {$(VPATH)}debug_counter.h ractor.$(OBJEXT): {$(VPATH)}defines.h +ractor.$(OBJEXT): {$(VPATH)}encindex.h ractor.$(OBJEXT): {$(VPATH)}encoding.h ractor.$(OBJEXT): {$(VPATH)}eval_intern.h ractor.$(OBJEXT): {$(VPATH)}id.h @@ -13018,6 +13041,7 @@ range.$(OBJEXT): {$(VPATH)}backward/2/stdalign.h range.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h range.$(OBJEXT): {$(VPATH)}config.h range.$(OBJEXT): {$(VPATH)}defines.h +range.$(OBJEXT): {$(VPATH)}encindex.h range.$(OBJEXT): {$(VPATH)}encoding.h range.$(OBJEXT): {$(VPATH)}id.h range.$(OBJEXT): {$(VPATH)}id_table.h @@ -14688,6 +14712,7 @@ ruby.$(OBJEXT): {$(VPATH)}constant.h ruby.$(OBJEXT): {$(VPATH)}debug_counter.h ruby.$(OBJEXT): {$(VPATH)}defines.h ruby.$(OBJEXT): {$(VPATH)}dln.h +ruby.$(OBJEXT): {$(VPATH)}encindex.h ruby.$(OBJEXT): {$(VPATH)}encoding.h ruby.$(OBJEXT): {$(VPATH)}eval_intern.h ruby.$(OBJEXT): {$(VPATH)}id.h @@ -14896,6 +14921,7 @@ ruby_parser.$(OBJEXT): {$(VPATH)}backward/2/stdalign.h ruby_parser.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h ruby_parser.$(OBJEXT): {$(VPATH)}config.h ruby_parser.$(OBJEXT): {$(VPATH)}defines.h +ruby_parser.$(OBJEXT): {$(VPATH)}encindex.h ruby_parser.$(OBJEXT): {$(VPATH)}encoding.h ruby_parser.$(OBJEXT): {$(VPATH)}intern.h ruby_parser.$(OBJEXT): {$(VPATH)}internal.h @@ -15306,6 +15332,7 @@ set.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h set.$(OBJEXT): {$(VPATH)}config.h set.$(OBJEXT): {$(VPATH)}constant.h set.$(OBJEXT): {$(VPATH)}defines.h +set.$(OBJEXT): {$(VPATH)}encindex.h set.$(OBJEXT): {$(VPATH)}encoding.h set.$(OBJEXT): {$(VPATH)}id.h set.$(OBJEXT): {$(VPATH)}id_table.h @@ -15678,6 +15705,7 @@ shape.$(OBJEXT): {$(VPATH)}config.h shape.$(OBJEXT): {$(VPATH)}constant.h shape.$(OBJEXT): {$(VPATH)}debug_counter.h shape.$(OBJEXT): {$(VPATH)}defines.h +shape.$(OBJEXT): {$(VPATH)}encindex.h shape.$(OBJEXT): {$(VPATH)}encoding.h shape.$(OBJEXT): {$(VPATH)}id.h shape.$(OBJEXT): {$(VPATH)}id_table.h @@ -15892,6 +15920,7 @@ signal.$(OBJEXT): {$(VPATH)}config.h signal.$(OBJEXT): {$(VPATH)}constant.h signal.$(OBJEXT): {$(VPATH)}debug_counter.h signal.$(OBJEXT): {$(VPATH)}defines.h +signal.$(OBJEXT): {$(VPATH)}encindex.h signal.$(OBJEXT): {$(VPATH)}encoding.h signal.$(OBJEXT): {$(VPATH)}eval_intern.h signal.$(OBJEXT): {$(VPATH)}id.h @@ -16101,6 +16130,7 @@ sprintf.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h sprintf.$(OBJEXT): {$(VPATH)}config.h sprintf.$(OBJEXT): {$(VPATH)}constant.h sprintf.$(OBJEXT): {$(VPATH)}defines.h +sprintf.$(OBJEXT): {$(VPATH)}encindex.h sprintf.$(OBJEXT): {$(VPATH)}encoding.h sprintf.$(OBJEXT): {$(VPATH)}id.h sprintf.$(OBJEXT): {$(VPATH)}id_table.h @@ -16457,6 +16487,7 @@ strftime.$(OBJEXT): {$(VPATH)}backward/2/stdalign.h strftime.$(OBJEXT): {$(VPATH)}backward/2/stdarg.h strftime.$(OBJEXT): {$(VPATH)}config.h strftime.$(OBJEXT): {$(VPATH)}defines.h +strftime.$(OBJEXT): {$(VPATH)}encindex.h strftime.$(OBJEXT): {$(VPATH)}encoding.h strftime.$(OBJEXT): {$(VPATH)}intern.h strftime.$(OBJEXT): {$(VPATH)}internal.h @@ -16925,6 +16956,7 @@ struct.$(OBJEXT): {$(VPATH)}config.h struct.$(OBJEXT): {$(VPATH)}constant.h struct.$(OBJEXT): {$(VPATH)}debug_counter.h struct.$(OBJEXT): {$(VPATH)}defines.h +struct.$(OBJEXT): {$(VPATH)}encindex.h struct.$(OBJEXT): {$(VPATH)}encoding.h struct.$(OBJEXT): {$(VPATH)}id.h struct.$(OBJEXT): {$(VPATH)}id_table.h @@ -17141,6 +17173,7 @@ symbol.$(OBJEXT): {$(VPATH)}constant.h symbol.$(OBJEXT): {$(VPATH)}darray.h symbol.$(OBJEXT): {$(VPATH)}debug_counter.h symbol.$(OBJEXT): {$(VPATH)}defines.h +symbol.$(OBJEXT): {$(VPATH)}encindex.h symbol.$(OBJEXT): {$(VPATH)}encoding.h symbol.$(OBJEXT): {$(VPATH)}id.c symbol.$(OBJEXT): {$(VPATH)}id.h @@ -17398,6 +17431,7 @@ thread.$(OBJEXT): {$(VPATH)}constant.h thread.$(OBJEXT): {$(VPATH)}debug.h thread.$(OBJEXT): {$(VPATH)}debug_counter.h thread.$(OBJEXT): {$(VPATH)}defines.h +thread.$(OBJEXT): {$(VPATH)}encindex.h thread.$(OBJEXT): {$(VPATH)}encoding.h thread.$(OBJEXT): {$(VPATH)}eval_intern.h thread.$(OBJEXT): {$(VPATH)}fiber/scheduler.h @@ -17628,6 +17662,7 @@ time.$(OBJEXT): {$(VPATH)}builtin.h time.$(OBJEXT): {$(VPATH)}config.h time.$(OBJEXT): {$(VPATH)}constant.h time.$(OBJEXT): {$(VPATH)}defines.h +time.$(OBJEXT): {$(VPATH)}encindex.h time.$(OBJEXT): {$(VPATH)}encoding.h time.$(OBJEXT): {$(VPATH)}id.h time.$(OBJEXT): {$(VPATH)}id_table.h @@ -17830,6 +17865,7 @@ transcode.$(OBJEXT): {$(VPATH)}config.h transcode.$(OBJEXT): {$(VPATH)}constant.h transcode.$(OBJEXT): {$(VPATH)}debug_counter.h transcode.$(OBJEXT): {$(VPATH)}defines.h +transcode.$(OBJEXT): {$(VPATH)}encindex.h transcode.$(OBJEXT): {$(VPATH)}encoding.h transcode.$(OBJEXT): {$(VPATH)}id.h transcode.$(OBJEXT): {$(VPATH)}id_table.h @@ -18211,6 +18247,7 @@ variable.$(OBJEXT): {$(VPATH)}config.h variable.$(OBJEXT): {$(VPATH)}constant.h variable.$(OBJEXT): {$(VPATH)}debug_counter.h variable.$(OBJEXT): {$(VPATH)}defines.h +variable.$(OBJEXT): {$(VPATH)}encindex.h variable.$(OBJEXT): {$(VPATH)}encoding.h variable.$(OBJEXT): {$(VPATH)}id.h variable.$(OBJEXT): {$(VPATH)}id_table.h @@ -18687,6 +18724,7 @@ vm.$(OBJEXT): {$(VPATH)}constant.h vm.$(OBJEXT): {$(VPATH)}debug_counter.h vm.$(OBJEXT): {$(VPATH)}defines.h vm.$(OBJEXT): {$(VPATH)}defs/opt_operand.def +vm.$(OBJEXT): {$(VPATH)}encindex.h vm.$(OBJEXT): {$(VPATH)}encoding.h vm.$(OBJEXT): {$(VPATH)}eval_intern.h vm.$(OBJEXT): {$(VPATH)}id.h @@ -18951,6 +18989,7 @@ vm_backtrace.$(OBJEXT): {$(VPATH)}constant.h vm_backtrace.$(OBJEXT): {$(VPATH)}debug.h vm_backtrace.$(OBJEXT): {$(VPATH)}debug_counter.h vm_backtrace.$(OBJEXT): {$(VPATH)}defines.h +vm_backtrace.$(OBJEXT): {$(VPATH)}encindex.h vm_backtrace.$(OBJEXT): {$(VPATH)}encoding.h vm_backtrace.$(OBJEXT): {$(VPATH)}eval_intern.h vm_backtrace.$(OBJEXT): {$(VPATH)}id.h @@ -20087,6 +20126,7 @@ yjit.$(OBJEXT): {$(VPATH)}constant.h yjit.$(OBJEXT): {$(VPATH)}debug.h yjit.$(OBJEXT): {$(VPATH)}debug_counter.h yjit.$(OBJEXT): {$(VPATH)}defines.h +yjit.$(OBJEXT): {$(VPATH)}encindex.h yjit.$(OBJEXT): {$(VPATH)}encoding.h yjit.$(OBJEXT): {$(VPATH)}id.h yjit.$(OBJEXT): {$(VPATH)}id_table.h @@ -20342,6 +20382,7 @@ zjit.$(OBJEXT): {$(VPATH)}constant.h zjit.$(OBJEXT): {$(VPATH)}debug.h zjit.$(OBJEXT): {$(VPATH)}debug_counter.h zjit.$(OBJEXT): {$(VPATH)}defines.h +zjit.$(OBJEXT): {$(VPATH)}encindex.h zjit.$(OBJEXT): {$(VPATH)}encoding.h zjit.$(OBJEXT): {$(VPATH)}id.h zjit.$(OBJEXT): {$(VPATH)}id_table.h diff --git a/ext/-test-/stack/depend b/ext/-test-/stack/depend index 31571c882e6eb7..77e93bb201db09 100644 --- a/ext/-test-/stack/depend +++ b/ext/-test-/stack/depend @@ -172,6 +172,7 @@ stack.o: $(hdrdir)/ruby/oniguruma.h stack.o: $(hdrdir)/ruby/ruby.h stack.o: $(hdrdir)/ruby/st.h stack.o: $(hdrdir)/ruby/subst.h +stack.o: $(top_srcdir)/encindex.h stack.o: $(top_srcdir)/internal/compilers.h stack.o: $(top_srcdir)/internal/string.h stack.o: stack.c diff --git a/ext/-test-/string/depend b/ext/-test-/string/depend index de6e775accf395..478ae3b82b7500 100644 --- a/ext/-test-/string/depend +++ b/ext/-test-/string/depend @@ -172,6 +172,7 @@ capacity.o: $(hdrdir)/ruby/oniguruma.h capacity.o: $(hdrdir)/ruby/ruby.h capacity.o: $(hdrdir)/ruby/st.h capacity.o: $(hdrdir)/ruby/subst.h +capacity.o: $(top_srcdir)/encindex.h capacity.o: $(top_srcdir)/internal/compilers.h capacity.o: $(top_srcdir)/internal/string.h capacity.o: capacity.c @@ -679,6 +680,7 @@ cstr.o: $(hdrdir)/ruby/oniguruma.h cstr.o: $(hdrdir)/ruby/ruby.h cstr.o: $(hdrdir)/ruby/st.h cstr.o: $(hdrdir)/ruby/subst.h +cstr.o: $(top_srcdir)/encindex.h cstr.o: $(top_srcdir)/internal.h cstr.o: $(top_srcdir)/internal/compilers.h cstr.o: $(top_srcdir)/internal/string.h @@ -1535,6 +1537,7 @@ fstring.o: $(hdrdir)/ruby/oniguruma.h fstring.o: $(hdrdir)/ruby/ruby.h fstring.o: $(hdrdir)/ruby/st.h fstring.o: $(hdrdir)/ruby/subst.h +fstring.o: $(top_srcdir)/encindex.h fstring.o: $(top_srcdir)/internal/compilers.h fstring.o: $(top_srcdir)/internal/string.h fstring.o: fstring.c diff --git a/ext/objspace/depend b/ext/objspace/depend index 04b26eb6c26567..d9dfc0c42b4740 100644 --- a/ext/objspace/depend +++ b/ext/objspace/depend @@ -602,6 +602,7 @@ objspace_dump.o: $(top_srcdir)/ccan/list/list.h objspace_dump.o: $(top_srcdir)/ccan/str/str.h objspace_dump.o: $(top_srcdir)/constant.h objspace_dump.o: $(top_srcdir)/debug_counter.h +objspace_dump.o: $(top_srcdir)/encindex.h objspace_dump.o: $(top_srcdir)/id_table.h objspace_dump.o: $(top_srcdir)/internal.h objspace_dump.o: $(top_srcdir)/internal/array.h diff --git a/ext/ripper/depend b/ext/ripper/depend index bd2de759065ba2..944da25ee94f38 100644 --- a/ext/ripper/depend +++ b/ext/ripper/depend @@ -578,6 +578,7 @@ ripper.o: $(top_srcdir)/ccan/container_of/container_of.h ripper.o: $(top_srcdir)/ccan/list/list.h ripper.o: $(top_srcdir)/ccan/str/str.h ripper.o: $(top_srcdir)/constant.h +ripper.o: $(top_srcdir)/encindex.h ripper.o: $(top_srcdir)/id_table.h ripper.o: $(top_srcdir)/internal.h ripper.o: $(top_srcdir)/internal/array.h diff --git a/ext/socket/depend b/ext/socket/depend index 3573dc45e2fa9f..77f6239a3da52b 100644 --- a/ext/socket/depend +++ b/ext/socket/depend @@ -193,6 +193,7 @@ ancdata.o: $(top_srcdir)/ccan/check_type/check_type.h ancdata.o: $(top_srcdir)/ccan/container_of/container_of.h ancdata.o: $(top_srcdir)/ccan/list/list.h ancdata.o: $(top_srcdir)/ccan/str/str.h +ancdata.o: $(top_srcdir)/encindex.h ancdata.o: $(top_srcdir)/id_table.h ancdata.o: $(top_srcdir)/internal.h ancdata.o: $(top_srcdir)/internal/array.h @@ -408,6 +409,7 @@ basicsocket.o: $(top_srcdir)/ccan/check_type/check_type.h basicsocket.o: $(top_srcdir)/ccan/container_of/container_of.h basicsocket.o: $(top_srcdir)/ccan/list/list.h basicsocket.o: $(top_srcdir)/ccan/str/str.h +basicsocket.o: $(top_srcdir)/encindex.h basicsocket.o: $(top_srcdir)/id_table.h basicsocket.o: $(top_srcdir)/internal.h basicsocket.o: $(top_srcdir)/internal/array.h @@ -623,6 +625,7 @@ constants.o: $(top_srcdir)/ccan/check_type/check_type.h constants.o: $(top_srcdir)/ccan/container_of/container_of.h constants.o: $(top_srcdir)/ccan/list/list.h constants.o: $(top_srcdir)/ccan/str/str.h +constants.o: $(top_srcdir)/encindex.h constants.o: $(top_srcdir)/id_table.h constants.o: $(top_srcdir)/internal.h constants.o: $(top_srcdir)/internal/array.h @@ -839,6 +842,7 @@ ifaddr.o: $(top_srcdir)/ccan/check_type/check_type.h ifaddr.o: $(top_srcdir)/ccan/container_of/container_of.h ifaddr.o: $(top_srcdir)/ccan/list/list.h ifaddr.o: $(top_srcdir)/ccan/str/str.h +ifaddr.o: $(top_srcdir)/encindex.h ifaddr.o: $(top_srcdir)/id_table.h ifaddr.o: $(top_srcdir)/internal.h ifaddr.o: $(top_srcdir)/internal/array.h @@ -1054,6 +1058,7 @@ init.o: $(top_srcdir)/ccan/check_type/check_type.h init.o: $(top_srcdir)/ccan/container_of/container_of.h init.o: $(top_srcdir)/ccan/list/list.h init.o: $(top_srcdir)/ccan/str/str.h +init.o: $(top_srcdir)/encindex.h init.o: $(top_srcdir)/id_table.h init.o: $(top_srcdir)/internal.h init.o: $(top_srcdir)/internal/array.h @@ -1269,6 +1274,7 @@ ipsocket.o: $(top_srcdir)/ccan/check_type/check_type.h ipsocket.o: $(top_srcdir)/ccan/container_of/container_of.h ipsocket.o: $(top_srcdir)/ccan/list/list.h ipsocket.o: $(top_srcdir)/ccan/str/str.h +ipsocket.o: $(top_srcdir)/encindex.h ipsocket.o: $(top_srcdir)/id_table.h ipsocket.o: $(top_srcdir)/internal.h ipsocket.o: $(top_srcdir)/internal/array.h @@ -1484,6 +1490,7 @@ option.o: $(top_srcdir)/ccan/check_type/check_type.h option.o: $(top_srcdir)/ccan/container_of/container_of.h option.o: $(top_srcdir)/ccan/list/list.h option.o: $(top_srcdir)/ccan/str/str.h +option.o: $(top_srcdir)/encindex.h option.o: $(top_srcdir)/id_table.h option.o: $(top_srcdir)/internal.h option.o: $(top_srcdir)/internal/array.h @@ -1699,6 +1706,7 @@ raddrinfo.o: $(top_srcdir)/ccan/check_type/check_type.h raddrinfo.o: $(top_srcdir)/ccan/container_of/container_of.h raddrinfo.o: $(top_srcdir)/ccan/list/list.h raddrinfo.o: $(top_srcdir)/ccan/str/str.h +raddrinfo.o: $(top_srcdir)/encindex.h raddrinfo.o: $(top_srcdir)/id_table.h raddrinfo.o: $(top_srcdir)/internal.h raddrinfo.o: $(top_srcdir)/internal/array.h @@ -1914,6 +1922,7 @@ socket.o: $(top_srcdir)/ccan/check_type/check_type.h socket.o: $(top_srcdir)/ccan/container_of/container_of.h socket.o: $(top_srcdir)/ccan/list/list.h socket.o: $(top_srcdir)/ccan/str/str.h +socket.o: $(top_srcdir)/encindex.h socket.o: $(top_srcdir)/id_table.h socket.o: $(top_srcdir)/internal.h socket.o: $(top_srcdir)/internal/array.h @@ -2129,6 +2138,7 @@ sockssocket.o: $(top_srcdir)/ccan/check_type/check_type.h sockssocket.o: $(top_srcdir)/ccan/container_of/container_of.h sockssocket.o: $(top_srcdir)/ccan/list/list.h sockssocket.o: $(top_srcdir)/ccan/str/str.h +sockssocket.o: $(top_srcdir)/encindex.h sockssocket.o: $(top_srcdir)/id_table.h sockssocket.o: $(top_srcdir)/internal.h sockssocket.o: $(top_srcdir)/internal/array.h @@ -2344,6 +2354,7 @@ tcpserver.o: $(top_srcdir)/ccan/check_type/check_type.h tcpserver.o: $(top_srcdir)/ccan/container_of/container_of.h tcpserver.o: $(top_srcdir)/ccan/list/list.h tcpserver.o: $(top_srcdir)/ccan/str/str.h +tcpserver.o: $(top_srcdir)/encindex.h tcpserver.o: $(top_srcdir)/id_table.h tcpserver.o: $(top_srcdir)/internal.h tcpserver.o: $(top_srcdir)/internal/array.h @@ -2559,6 +2570,7 @@ tcpsocket.o: $(top_srcdir)/ccan/check_type/check_type.h tcpsocket.o: $(top_srcdir)/ccan/container_of/container_of.h tcpsocket.o: $(top_srcdir)/ccan/list/list.h tcpsocket.o: $(top_srcdir)/ccan/str/str.h +tcpsocket.o: $(top_srcdir)/encindex.h tcpsocket.o: $(top_srcdir)/id_table.h tcpsocket.o: $(top_srcdir)/internal.h tcpsocket.o: $(top_srcdir)/internal/array.h @@ -2774,6 +2786,7 @@ udpsocket.o: $(top_srcdir)/ccan/check_type/check_type.h udpsocket.o: $(top_srcdir)/ccan/container_of/container_of.h udpsocket.o: $(top_srcdir)/ccan/list/list.h udpsocket.o: $(top_srcdir)/ccan/str/str.h +udpsocket.o: $(top_srcdir)/encindex.h udpsocket.o: $(top_srcdir)/id_table.h udpsocket.o: $(top_srcdir)/internal.h udpsocket.o: $(top_srcdir)/internal/array.h @@ -2989,6 +3002,7 @@ unixserver.o: $(top_srcdir)/ccan/check_type/check_type.h unixserver.o: $(top_srcdir)/ccan/container_of/container_of.h unixserver.o: $(top_srcdir)/ccan/list/list.h unixserver.o: $(top_srcdir)/ccan/str/str.h +unixserver.o: $(top_srcdir)/encindex.h unixserver.o: $(top_srcdir)/id_table.h unixserver.o: $(top_srcdir)/internal.h unixserver.o: $(top_srcdir)/internal/array.h @@ -3204,6 +3218,7 @@ unixsocket.o: $(top_srcdir)/ccan/check_type/check_type.h unixsocket.o: $(top_srcdir)/ccan/container_of/container_of.h unixsocket.o: $(top_srcdir)/ccan/list/list.h unixsocket.o: $(top_srcdir)/ccan/str/str.h +unixsocket.o: $(top_srcdir)/encindex.h unixsocket.o: $(top_srcdir)/id_table.h unixsocket.o: $(top_srcdir)/internal.h unixsocket.o: $(top_srcdir)/internal/array.h diff --git a/file.c b/file.c index 89294ea009e7b3..8658b306774aad 100644 --- a/file.c +++ b/file.c @@ -169,6 +169,7 @@ typedef struct timespec stat_timestamp; #include "internal.h" #include "internal/compilers.h" #include "internal/dir.h" +#include "internal/encoding.h" #include "internal/error.h" #include "internal/file.h" #include "internal/io.h" @@ -3713,6 +3714,22 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc) return (char *)path; } +static char * +single_byte_chompdirsep(const char *path, const char *end) +{ + while (path < end) { + if (isdirsep(*path)) { + const char *last = path++; + while (path < end && isdirsep(*path)) path++; + if (path >= end) return (char *)last; + } + else { + path++; + } + } + return (char *)path; +} + char * rb_enc_path_end(const char *path, const char *end, rb_encoding *enc) { @@ -3723,7 +3740,7 @@ rb_enc_path_end(const char *path, const char *end, rb_encoding *enc) static rb_encoding * fs_enc_check(VALUE path1, VALUE path2) { - rb_encoding *enc = rb_enc_check(path1, path2); + rb_encoding *enc = rb_enc_check_str(path1, path2); int encidx = rb_enc_to_index(enc); if (encidx == ENCINDEX_US_ASCII) { encidx = rb_enc_get_index(path1); @@ -4651,7 +4668,7 @@ rb_check_realpath_emulate(VALUE basedir, VALUE path, rb_encoding *origenc, enum return resolved; } -static VALUE rb_file_join(VALUE ary); +static VALUE rb_file_join(long argc, VALUE *args); #ifndef HAVE_REALPATH static VALUE @@ -4692,7 +4709,8 @@ rb_check_realpath_internal(VALUE basedir, VALUE path, rb_encoding *origenc, enum unresolved_path = rb_str_dup_frozen(path); if (*RSTRING_PTR(unresolved_path) != '/' && !NIL_P(basedir)) { - unresolved_path = rb_file_join(rb_assoc_new(basedir, unresolved_path)); + VALUE paths[2] = {basedir, unresolved_path}; + unresolved_path = rb_file_join(2, paths); } if (origenc) unresolved_path = TO_OSPATH(unresolved_path); @@ -5255,15 +5273,17 @@ rb_file_s_split(VALUE klass, VALUE path) return rb_assoc_new(rb_file_dirname(path), rb_file_s_basename(1,&path,Qundef)); } +static VALUE rb_file_join_ary(VALUE ary); + static VALUE file_inspect_join(VALUE ary, VALUE arg, int recur) { if (recur || ary == arg) rb_raise(rb_eArgError, "recursive array"); - return rb_file_join(arg); + return rb_file_join_ary(arg); } static VALUE -rb_file_join(VALUE ary) +rb_file_join_ary(VALUE ary) { long len, i; VALUE result, tmp; @@ -5328,6 +5348,69 @@ rb_file_join(VALUE ary) return result; } +static inline VALUE +rb_file_join_fastpath(long argc, VALUE *args) +{ + long size = argc; + + long i; + for (i = 0; i < argc; i++) { + VALUE tmp = args[i]; + if (RB_LIKELY(RB_TYPE_P(tmp, T_STRING) && rb_str_enc_fastpath(tmp))) { + size += RSTRING_LEN(tmp); + } + else { + return 0; + } + } + + VALUE result = rb_str_buf_new(size); + + StringValueCStr(args[0]); + int encidx = ENCODING_GET_INLINED(args[0]); + ENCODING_SET_INLINED(result, encidx); + rb_str_buf_append(result, args[0]); + + const char *name = RSTRING_PTR(result); + for (i = 1; i < argc; i++) { + VALUE tmp = args[i]; + StringValueCStr(tmp); + long len = RSTRING_LEN(result); + + const char *tail = single_byte_chompdirsep(name, name + len); + if (RSTRING_PTR(tmp) && isdirsep(RSTRING_PTR(tmp)[0])) { + rb_str_set_len(result, tail - name); + } + else if (!*tail) { + rb_str_cat(result, "/", 1); + } + + if (RB_UNLIKELY(ENCODING_GET_INLINED(tmp) != encidx)) { + rb_encoding *new_enc = fs_enc_check(result, tmp); + rb_enc_associate(result, new_enc); + encidx = rb_enc_to_index(new_enc); + } + + rb_str_buf_append(result, tmp); + } + + return result; +} + +static inline VALUE +rb_file_join(long argc, VALUE *args) +{ + if (RB_UNLIKELY(argc == 0)) { + return rb_str_new(0, 0); + } + + VALUE result = rb_file_join_fastpath(argc, args); + if (RB_LIKELY(result)) { + return result; + } + + return rb_file_join_ary(rb_ary_new_from_values(argc, args)); +} /* * call-seq: * File.join(string, ...) -> string @@ -5340,9 +5423,9 @@ rb_file_join(VALUE ary) */ static VALUE -rb_file_s_join(VALUE klass, VALUE args) +rb_file_s_join(int argc, VALUE *argv, VALUE klass) { - return rb_file_join(args); + return rb_file_join(argc, argv); } #if defined(HAVE_TRUNCATE) @@ -7584,7 +7667,7 @@ Init_File(void) /* separates directory parts in path */ rb_define_const(rb_cFile, "SEPARATOR", separator); rb_define_singleton_method(rb_cFile, "split", rb_file_s_split, 1); - rb_define_singleton_method(rb_cFile, "join", rb_file_s_join, -2); + rb_define_singleton_method(rb_cFile, "join", rb_file_s_join, -1); #ifdef DOSISH /* platform specific alternative separator */ diff --git a/internal/string.h b/internal/string.h index d6fea62061ddfa..ea81db7ed39dbf 100644 --- a/internal/string.h +++ b/internal/string.h @@ -14,6 +14,7 @@ #include "ruby/internal/stdbool.h" /* for bool */ #include "ruby/encoding.h" /* for rb_encoding */ #include "ruby/ruby.h" /* for VALUE */ +#include "encindex.h" #define STR_SHARED FL_USER0 /* = ELTS_SHARED */ #define STR_NOEMBED FL_USER1 @@ -29,6 +30,26 @@ enum ruby_rstring_private_flags { # undef rb_fstring_cstr #endif +static inline bool +rb_str_encindex_fastpath(int encindex) +{ + // The overwhelming majority of strings are in one of these 3 encodings. + switch (encindex) { + case ENCINDEX_ASCII_8BIT: + case ENCINDEX_UTF_8: + case ENCINDEX_US_ASCII: + return true; + default: + return false; + } +} + +static inline bool +rb_str_enc_fastpath(VALUE str) +{ + return rb_str_encindex_fastpath(ENCODING_GET_INLINED(str)); +} + /* string.c */ VALUE rb_str_dup_m(VALUE str); VALUE rb_fstring(VALUE); diff --git a/string.c b/string.c index 2d74c46a360aa8..cfadabd3794fc2 100644 --- a/string.c +++ b/string.c @@ -146,27 +146,7 @@ VALUE rb_cSymbol; RSTRING(str)->len = (n); \ } while (0) -static inline bool -str_encindex_fastpath(int encindex) -{ - // The overwhelming majority of strings are in one of these 3 encodings. - switch (encindex) { - case ENCINDEX_ASCII_8BIT: - case ENCINDEX_UTF_8: - case ENCINDEX_US_ASCII: - return true; - default: - return false; - } -} - -static inline bool -str_enc_fastpath(VALUE str) -{ - return str_encindex_fastpath(ENCODING_GET_INLINED(str)); -} - -#define TERM_LEN(str) (str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str)))) +#define TERM_LEN(str) (rb_str_enc_fastpath(str) ? 1 : rb_enc_mbminlen(rb_enc_from_index(ENCODING_GET(str)))) #define TERM_FILL(ptr, termlen) do {\ char *const term_fill_ptr = (ptr);\ const int term_fill_len = (termlen);\ @@ -960,7 +940,7 @@ static inline bool rb_enc_str_asciicompat(VALUE str) { int encindex = ENCODING_GET_INLINED(str); - return str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex)); + return rb_str_encindex_fastpath(encindex) || rb_enc_asciicompat(rb_enc_get_from_index(encindex)); } int @@ -2796,7 +2776,7 @@ rb_must_asciicompat(VALUE str) rb_raise(rb_eTypeError, "not encoding capable object"); } - if (RB_LIKELY(str_encindex_fastpath(encindex))) { + if (RB_LIKELY(rb_str_encindex_fastpath(encindex))) { return; } @@ -2897,16 +2877,21 @@ str_null_check(VALUE str, int *w) { char *s = RSTRING_PTR(str); long len = RSTRING_LEN(str); - rb_encoding *enc = rb_enc_get(str); - const int minlen = rb_enc_mbminlen(enc); + int minlen = 1; + + if (RB_UNLIKELY(!rb_str_enc_fastpath(str))) { + rb_encoding *enc = rb_enc_get(str); + minlen = rb_enc_mbminlen(enc); - if (minlen > 1) { - *w = 1; - if (str_null_char(s, len, minlen, enc)) { - return NULL; + if (minlen > 1) { + *w = 1; + if (str_null_char(s, len, minlen, enc)) { + return NULL; + } + return str_fill_term(str, s, len, minlen); } - return str_fill_term(str, s, len, minlen); } + *w = 0; if (!s || memchr(s, 0, len)) { return NULL; @@ -3765,7 +3750,7 @@ rb_str_buf_append(VALUE str, VALUE str2) { int str2_cr = rb_enc_str_coderange(str2); - if (str_enc_fastpath(str)) { + if (rb_str_enc_fastpath(str)) { switch (str2_cr) { case ENC_CODERANGE_7BIT: // If RHS is 7bit we can do simple concatenation From 7e0e9984d0250afbd67a17b8b2d6846f1595ddce Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 18 Jan 2026 13:49:59 +0100 Subject: [PATCH 3/6] rb_file_join_fastpath: optimize searching for separators `chompdirsep` searches from the start of the string each time, which perhaps is necessary for certain encodings (not even sure?) but for the common encodings it's very wasteful. Instead we can start from the back of the string and only compare one or two characters in most cases. Also replace `StringValueCStr` for the simpler `rb_str_null_check` as we only care about whether the string contains `NULL` bytes, we don't care whether it is NULL terminated or not. We also only check the final string for NULLs. ``` compare-ruby: ruby 4.1.0dev (2026-01-17T14:40:03Z master 00a3b71eaf) +PRISM [arm64-darwin25] built-ruby: ruby 4.1.0dev (2026-01-18T12:55:15Z spedup-file-join 5948e92e03) +PRISM [arm64-darwin25] warming up.... | |compare-ruby|built-ruby| |:-------------|-----------:|---------:| |two_strings | 2.477M| 19.317M| | | -| 7.80x| |many_strings | 547.577k| 10.298M| | | -| 18.81x| |array | 515.280k| 523.291k| | | -| 1.02x| |mixed | 621.840k| 635.422k| | | -| 1.02x| ``` --- file.c | 37 +++++++++++++++---------------------- internal/string.h | 1 + string.c | 28 ++++++++++++++++++++++++++++ 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/file.c b/file.c index 8658b306774aad..a47cc32acd2f77 100644 --- a/file.c +++ b/file.c @@ -3714,22 +3714,6 @@ chompdirsep(const char *path, const char *end, rb_encoding *enc) return (char *)path; } -static char * -single_byte_chompdirsep(const char *path, const char *end) -{ - while (path < end) { - if (isdirsep(*path)) { - const char *last = path++; - while (path < end && isdirsep(*path)) path++; - if (path >= end) return (char *)last; - } - else { - path++; - } - } - return (char *)path; -} - char * rb_enc_path_end(const char *path, const char *end, rb_encoding *enc) { @@ -5374,14 +5358,22 @@ rb_file_join_fastpath(long argc, VALUE *args) const char *name = RSTRING_PTR(result); for (i = 1; i < argc; i++) { VALUE tmp = args[i]; - StringValueCStr(tmp); long len = RSTRING_LEN(result); - const char *tail = single_byte_chompdirsep(name, name + len); - if (RSTRING_PTR(tmp) && isdirsep(RSTRING_PTR(tmp)[0])) { - rb_str_set_len(result, tail - name); + const char *tmp_s; + long tmp_len; + RSTRING_GETMEM(tmp, tmp_s, tmp_len); + + if (isdirsep(tmp_s[0])) { + // right side has a leading separator, remove left side separators. + long trailing_seps = 0; + while (isdirsep(name[len - trailing_seps - 1])) { + trailing_seps++; + } + rb_str_set_len(result, len - trailing_seps); } - else if (!*tail) { + else if (!isdirsep(name[len - 1])) { + // neither side have a separator, append one; rb_str_cat(result, "/", 1); } @@ -5391,9 +5383,10 @@ rb_file_join_fastpath(long argc, VALUE *args) encidx = rb_enc_to_index(new_enc); } - rb_str_buf_append(result, tmp); + rb_str_buf_cat(result, tmp_s, tmp_len); } + rb_str_null_check(result); return result; } diff --git a/internal/string.h b/internal/string.h index ea81db7ed39dbf..cd1e8d79296ef4 100644 --- a/internal/string.h +++ b/internal/string.h @@ -84,6 +84,7 @@ bool rb_str_reembeddable_p(VALUE); VALUE rb_str_upto_endless_each(VALUE, int (*each)(VALUE, VALUE), VALUE); VALUE rb_str_with_debug_created_info(VALUE, VALUE, int); VALUE rb_str_frozen_bare_string(VALUE); +const char *rb_str_null_check(VALUE); /* error.c */ void rb_warn_unchilled_literal(VALUE str); diff --git a/string.c b/string.c index cfadabd3794fc2..1e0b9929ef150c 100644 --- a/string.c +++ b/string.c @@ -2902,6 +2902,34 @@ str_null_check(VALUE str, int *w) return s; } +const char * +rb_str_null_check(VALUE str) +{ + RUBY_ASSERT(RB_TYPE_P(str, T_STRING)); + + char *s; + long len; + RSTRING_GETMEM(str, s, len); + + if (RB_LIKELY(rb_str_enc_fastpath(str))) { + if (!s || memchr(s, 0, len)) { + rb_raise(rb_eArgError, "string contains null byte"); + } + } + else { + int w; + const char *s = str_null_check(str, &w); + if (!s) { + if (w) { + rb_raise(rb_eArgError, "string contains null char"); + } + rb_raise(rb_eArgError, "string contains null byte"); + } + } + + return s; +} + char * rb_str_to_cstr(VALUE str) { From 19450d85d6caeb3f08c1c987ba447237c5697fa9 Mon Sep 17 00:00:00 2001 From: Peter Zhu Date: Sat, 17 Jan 2026 09:42:13 -0500 Subject: [PATCH 4/6] [DOC] Improve docs for ObjectSpace.define_finalizer --- gc.c | 83 ++++++++++++++++++++++++++++++++---------------------------- 1 file changed, 44 insertions(+), 39 deletions(-) diff --git a/gc.c b/gc.c index 1fe3dbf0aebb4b..ab0539cd3358a3 100644 --- a/gc.c +++ b/gc.c @@ -1709,63 +1709,68 @@ rb_gc_copy_finalizer(VALUE dest, VALUE obj) /* * call-seq: - * ObjectSpace.define_finalizer(obj, aProc=proc()) + * ObjectSpace.define_finalizer(obj) {|id| ... } -> array + * ObjectSpace.define_finalizer(obj, finalizer) -> array * - * Adds aProc as a finalizer, to be called after obj - * was destroyed. The object ID of the obj will be passed - * as an argument to aProc. If aProc is a lambda or - * method, make sure it can be called with a single argument. + * Adds a new finalizer for +obj+ that is called when +obj+ is destroyed + * by the garbage collector or when Ruby shuts down (which ever comes first). * - * The return value is an array [0, aProc]. + * With a block given, uses the block as the callback. Without a block given, + * uses a callable object +finalizer+ as the callback. The callback is called + * when +obj+ is destroyed with a single argument +id+ which is the object + * ID of +obj+ (see Object#object_id). * - * The two recommended patterns are to either create the finaliser proc - * in a non-instance method where it can safely capture the needed state, - * or to use a custom callable object that stores the needed state - * explicitly as instance variables. + * The return value is an array [0, callback], where +callback+ + * is a Proc created from the block if one was given or +finalizer+ otherwise. * - * class Foo - * def initialize(data_needed_for_finalization) - * ObjectSpace.define_finalizer(self, self.class.create_finalizer(data_needed_for_finalization)) - * end + * Note that defining a finalizer in an instance method of the object may prevent + * the object from being garbage collected since if the block or +finalizer+ refers + * to +obj+ then +obj+ will never be reclaimed by the garbage collector. For example, + * the following script demonstrates the issue: * - * def self.create_finalizer(data_needed_for_finalization) - * proc { - * puts "finalizing #{data_needed_for_finalization}" - * } + * class Foo + * def define_final + * ObjectSpace.define_finalizer(self) do |id| + * puts "Running finalizer for #{id}!" + * end * end * end * - * class Bar - * class Remover - * def initialize(data_needed_for_finalization) - * @data_needed_for_finalization = data_needed_for_finalization - * end + * obj = Foo.new + * obj.define_final * - * def call(id) - * puts "finalizing #{@data_needed_for_finalization}" - * end + * There are two patterns to solve this issue: + * + * - Create the finalizer in a non-instance method so it can safely capture + * the needed state: + * + * class Foo + * def define_final + * ObjectSpace.define_finalizer(self, self.class.create_finalizer) * end * - * def initialize(data_needed_for_finalization) - * ObjectSpace.define_finalizer(self, Remover.new(data_needed_for_finalization)) + * def self.create_finalizer + * proc do |id| + * puts "Running finalizer for #{id}!" + * end * end * end * - * Note that if your finalizer references the object to be - * finalized it will never be run on GC, although it will still be - * run at exit. You will get a warning if you capture the object - * to be finalized as the receiver of the finalizer. + * - Use a callable object: + * + * class Foo + * class Finalizer + * def call(id) + * puts "Running finalizer for #{id}!" + * end + * end * - * class CapturesSelf - * def initialize(name) - * ObjectSpace.define_finalizer(self, proc { - * # this finalizer will only be run on exit - * puts "finalizing #{name}" - * }) + * def define_final + * ObjectSpace.define_finalizer(self, Finalizer.new) * end * end * - * Also note that finalization can be unpredictable and is never guaranteed + * Note that finalization can be unpredictable and is never guaranteed * to be run except on exit. */ From 43d879d3aca3287f47d18ecdd8660965df24e3d3 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Sun, 18 Jan 2026 16:43:07 +0100 Subject: [PATCH 5/6] rb_file_join_fastpath: removed useless StringValueCStr --- file.c | 1 - 1 file changed, 1 deletion(-) diff --git a/file.c b/file.c index a47cc32acd2f77..2ed3672ea7672c 100644 --- a/file.c +++ b/file.c @@ -5350,7 +5350,6 @@ rb_file_join_fastpath(long argc, VALUE *args) VALUE result = rb_str_buf_new(size); - StringValueCStr(args[0]); int encidx = ENCODING_GET_INLINED(args[0]); ENCODING_SET_INLINED(result, encidx); rb_str_buf_append(result, args[0]); From 37c7ee536d88afbac4a9d8fba8d48717462502fd Mon Sep 17 00:00:00 2001 From: tomoya ishida Date: Mon, 19 Jan 2026 02:25:02 +0900 Subject: [PATCH 6/6] [DOC] Replace rdoc style codeblocks with markdown style backtick codeblocks in markdown documents (#15900) --- doc/contributing/documentation_guide.md | 14 +++++++------- doc/language/exceptions.md | 14 +++++++------- doc/language/options.md | 4 ++-- file.c | 14 +++++++------- 4 files changed, 23 insertions(+), 23 deletions(-) diff --git a/doc/contributing/documentation_guide.md b/doc/contributing/documentation_guide.md index 29a1c72b02f0de..4b1e2ac9adaf2e 100644 --- a/doc/contributing/documentation_guide.md +++ b/doc/contributing/documentation_guide.md @@ -265,16 +265,16 @@ and _never_ when referring to the class itself. When writing an explicit link, follow these guidelines. -#### +rdoc-ref+ Scheme +#### `rdoc-ref` Scheme -Use the +rdoc-ref+ scheme for: +Use the `rdoc-ref` scheme for: - A link in core documentation to other core documentation. - A link in core documentation to documentation in a standard library package. - A link in a standard library package to other documentation in that same standard library package. -See section "+rdoc-ref+ Scheme" in [links]. +See section "`rdoc-ref` Scheme" in [links]. #### URL-Based Link @@ -297,7 +297,7 @@ The name of a variable (as specified in its call-seq) should be marked up as [monofont]. Also, use monofont text for the name of a transient variable -(i.e., one defined and used only in the discussion, such as +n+). +(i.e., one defined and used only in the discussion, such as `n`). ### HTML Tags @@ -491,10 +491,10 @@ Return types: - If the method can return multiple different types, separate the types with "or" and, if necessary, commas. -- If the method can return multiple types, use +object+. -- If the method returns the receiver, use +self+. +- If the method can return multiple types, use `object`. +- If the method returns the receiver, use `self`. - If the method returns an object of the same class, - prefix `new_` if and only if the object is not +self+; + prefix `new_` if and only if the object is not `self`; example: `new_array`. Aliases: diff --git a/doc/language/exceptions.md b/doc/language/exceptions.md index 2c47455911376f..5f8f0ece69d2e6 100644 --- a/doc/language/exceptions.md +++ b/doc/language/exceptions.md @@ -504,18 +504,18 @@ These methods return backtrace information: By default, Ruby sets the backtrace of the exception to the location where it was raised. -The developer might adjust this by either providing +backtrace+ argument +The developer might adjust this by either providing `backtrace` argument to Kernel#raise, or using Exception#set_backtrace. Note that: -- by default, both +backtrace+ and +backtrace_locations+ represent the same backtrace; +- by default, both `backtrace` and `backtrace_locations` represent the same backtrace; - if the developer sets the backtrace by one of the above methods to an array of Thread::Backtrace::Location, they still represent the same backtrace; - if the developer sets the backtrace to a string or an array of strings: - - by Kernel#raise: +backtrace_locations+ become +nil+; - - by Exception#set_backtrace: +backtrace_locations+ preserve the original + - by Kernel#raise: `backtrace_locations` become `nil`; + - by Exception#set_backtrace: `backtrace_locations` preserve the original value; -- if the developer sets the backtrace to +nil+ by Exception#set_backtrace, - +backtrace_locations+ preserve the original value; but if the exception is then - reraised, both +backtrace+ and +backtrace_locations+ become the location of reraise. +- if the developer sets the backtrace to `nil` by Exception#set_backtrace, + `backtrace_locations` preserve the original value; but if the exception is then + reraised, both `backtrace` and `backtrace_locations` become the location of reraise. diff --git a/doc/language/options.md b/doc/language/options.md index cca87f42dedf0f..3421d73f552b40 100644 --- a/doc/language/options.md +++ b/doc/language/options.md @@ -640,7 +640,7 @@ Option `--encoding` is an alias for Option `--external-encoding` sets the default external encoding for the invoked Ruby program; -for values of +encoding+, +for values of `encoding`, see {Encoding: Names and Aliases}[rdoc-ref:encodings.rdoc@Names+and+Aliases]. ```console @@ -662,7 +662,7 @@ For a shorter help message, use option `-h`. Option `--internal-encoding` sets the default internal encoding for the invoked Ruby program; -for values of +encoding+, +for values of `encoding`, see {Encoding: Names and Aliases}[rdoc-ref:encodings.rdoc@Names+and+Aliases]. ```console diff --git a/file.c b/file.c index 2ed3672ea7672c..867b041a44af0f 100644 --- a/file.c +++ b/file.c @@ -5557,7 +5557,7 @@ rb_thread_flock(void *data) * call-seq: * flock(locking_constant) -> 0 or false * - * Locks or unlocks file +self+ according to the given `locking_constant`, + * Locks or unlocks file `self` according to the given `locking_constant`, * a bitwise OR of the values in the table below. * * Not available on all platforms. @@ -5567,10 +5567,10 @@ rb_thread_flock(void *data) * * | Constant | Lock | Effect * |-----------------|--------------|-----------------------------------------------------------------------------------------------------------------| - * | +File::LOCK_EX+ | Exclusive | Only one process may hold an exclusive lock for +self+ at a time. | - * | +File::LOCK_NB+ | Non-blocking | No blocking; may be combined with +File::LOCK_SH+ or +File::LOCK_EX+ using the bitwise OR operator \|. | - * | +File::LOCK_SH+ | Shared | Multiple processes may each hold a shared lock for +self+ at the same time. | - * | +File::LOCK_UN+ | Unlock | Remove an existing lock held by this process. | + * | `File::LOCK_EX` | Exclusive | Only one process may hold an exclusive lock for `self` at a time. | + * | `File::LOCK_NB` | Non-blocking | No blocking; may be combined with `File::LOCK_SH` or `File::LOCK_EX` using the bitwise OR operator \|. | + * | `File::LOCK_SH` | Shared | Multiple processes may each hold a shared lock for `self` at the same time. | + * | `File::LOCK_UN` | Unlock | Remove an existing lock held by this process. | * * Example: * @@ -5697,11 +5697,11 @@ test_check(int n, int argc, VALUE *argv) * | 'z' | Whether the entity exists and is of length zero. | * * - This test operates only on the entity at `path0`, - * and returns an integer size or +nil+: + * and returns an integer size or `nil`: * * | Character | Test | * |:------------:|:---------------------------------------------------------------------------------------------| - * | 's' | Returns positive integer size if the entity exists and has non-zero length, +nil+ otherwise. | + * | 's' | Returns positive integer size if the entity exists and has non-zero length, `nil` otherwise. | * * - Each of these tests operates only on the entity at `path0`, * and returns a Time object;