poky/meta/recipes-devtools/ruby/ruby/CVE-2024-49761-0001.patch

From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001
From: NAITOH Jun <naitoh@gmail.com>
Date: Wed, 17 Jan 2024 15:32:57 +0900
Subject: [PATCH] Use string scanner with baseparser (#105)

Using StringScanner reduces string copying in the parser and speeds up
parsing.

The now-unnecessary Source methods (scan, consume, match_to,
match_to_consume and position) are also removed.

https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105

```
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
Calculating -------------------------------------
                     rexml 3.2.6      master  3.2.6(YJIT)  master(YJIT)
                 dom       4.868       5.077        8.137         8.303 i/s -     100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s
                 sax      13.597      13.953       19.206        20.948 i/s -     100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s
                pull      15.641      16.918       22.266        25.378 i/s -     100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s
              stream      14.339      15.844       19.810        22.206 i/s -     100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s

Comparison:
                              dom
        master(YJIT):         8.3 i/s
         3.2.6(YJIT):         8.1 i/s - 1.02x  slower
              master:         5.1 i/s - 1.64x  slower
         rexml 3.2.6:         4.9 i/s - 1.71x  slower

                              sax
        master(YJIT):        20.9 i/s
         3.2.6(YJIT):        19.2 i/s - 1.09x  slower
              master:        14.0 i/s - 1.50x  slower
         rexml 3.2.6:        13.6 i/s - 1.54x  slower

                             pull
        master(YJIT):        25.4 i/s
         3.2.6(YJIT):        22.3 i/s - 1.14x  slower
              master:        16.9 i/s - 1.50x  slower
         rexml 3.2.6:        15.6 i/s - 1.62x  slower

                           stream
        master(YJIT):        22.2 i/s
         3.2.6(YJIT):        19.8 i/s - 1.12x  slower
              master:        15.8 i/s - 1.40x  slower
         rexml 3.2.6:        14.3 i/s - 1.55x  slower
```

- YJIT=ON : 1.02x - 1.14x faster
- YJIT=OFF : 1.02x - 1.10x faster

---------

Co-authored-by: Sutou Kouhei <kou@cozmixng.org>

CVE: CVE-2024-49761

Upstream-Status: Backport [https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c]

Signed-off-by: Divya Chellam <divya.chellam@windriver.com>
---
 .../lib/rexml/parsers/baseparser.rb           |  21 ++-
 .bundle/gems/rexml-3.2.5/lib/rexml/source.rb  | 149 ++++++------------
 2 files changed, 56 insertions(+), 114 deletions(-)

diff --git a/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb b/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
index 305b120..65bad26 100644
--- a/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
+++ b/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
@@ -96,7 +96,7 @@ module REXML
       ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
       PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
       GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
-      ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
+      ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um

       NOTATIONDECL_START = /\A\s*<!NOTATION/um
       EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
@@ -259,7 +259,7 @@ module REXML
           else
             @document_status = :after_doctype
             if @source.encoding == "UTF-8"
-              @source.buffer.force_encoding(::Encoding::UTF_8)
+              @source.buffer_encoding = ::Encoding::UTF_8
             end
           end
         end
@@ -274,8 +274,7 @@ module REXML
             return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]

           when ENTITY_START
-            match = @source.match( ENTITYDECL, true ).to_a.compact
-            match[0] = :entitydecl
+            match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
             ref = false
             if match[1] == '%'
               ref = true
@@ -392,6 +391,7 @@ module REXML
               unless md
                 raise REXML::ParseException.new("malformed XML: missing tag start", @source)
               end
+              tag = md[1]
               @document_status = :in_element
               prefixes = Set.new
               prefixes << md[2] if md[2]
@@ -405,23 +405,20 @@ module REXML
               end

               if closed
-                @closed = md[1]
+                @closed = tag
                 @nsstack.shift
               else
-                @tags.push( md[1] )
+                @tags.push( tag )
               end
-              return [ :start_element, md[1], attributes ]
+              return [ :start_element, tag, attributes ]
             end
           else
             md = @source.match( TEXT_PATTERN, true )
+            text = md[1]
             if md[0].length == 0
               @source.match( /(\s+)/, true )
             end
-            #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
-            #return [ :text, "" ] if md[0].length == 0
-            # unnormalized = Text::unnormalize( md[1], self )
-            # return PullEvent.new( :text, md[1], unnormalized )
-            return [ :text, md[1] ]
+            return [ :text, text ]
           end
         rescue REXML::UndefinedNamespaceException
           raise
diff --git a/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb b/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb
index 90b370b..71b08f9 100644
--- a/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb
+++ b/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb
@@ -30,8 +30,6 @@ module REXML
   # objects and provides consumption of text
   class Source
     include Encoding
-    # The current buffer (what we're going to read next)
-    attr_reader :buffer
     # The line number of the last consumed text
     attr_reader :line
     attr_reader :encoding
@@ -41,7 +39,8 @@ module REXML
     # @param encoding if non-null, sets the encoding of the source to this
     # value, overriding all encoding detection
     def initialize(arg, encoding=nil)
-      @orig = @buffer = arg
+      @orig = arg
+      @scanner = StringScanner.new(@orig)
       if encoding
         self.encoding = encoding
       else
@@ -50,6 +49,14 @@ module REXML
       @line = 0
     end

+    # The current buffer (what we're going to read next)
+    def buffer
+      @scanner.rest
+    end
+
+    def buffer_encoding=(encoding)
+      @scanner.string.force_encoding(encoding)
+    end

     # Inherited from Encoding
     # Overridden to support optimized en/decoding
@@ -58,98 +65,57 @@ module REXML
       encoding_updated
     end

-    # Scans the source for a given pattern.  Note, that this is not your
-    # usual scan() method.  For one thing, the pattern argument has some
-    # requirements; for another, the source can be consumed.  You can easily
-    # confuse this method.  Originally, the patterns were easier
-    # to construct and this method more robust, because this method
-    # generated search regexps on the fly; however, this was
-    # computationally expensive and slowed down the entire REXML package
-    # considerably, since this is by far the most commonly called method.
-    # @param pattern must be a Regexp, and must be in the form of
-    # /^\s*(#{your pattern, with no groups})(.*)/.  The first group
-    # will be returned; the second group is used if the consume flag is
-    # set.
-    # @param consume if true, the pattern returned will be consumed, leaving
-    # everything after it in the Source.
-    # @return the pattern, if found, or nil if the Source is empty or the
-    # pattern is not found.
-    def scan(pattern, cons=false)
-      return nil if @buffer.nil?
-      rv = @buffer.scan(pattern)
-      @buffer = $' if cons and rv.size>0
-      rv
-    end
-
     def read
     end

-    def consume( pattern )
-      @buffer = $' if pattern.match( @buffer )
-    end
-
-    def match_to( char, pattern )
-      return pattern.match(@buffer)
-    end
-
-    def match_to_consume( char, pattern )
-      md = pattern.match(@buffer)
-      @buffer = $'
-      return md
-    end
-
     def match(pattern, cons=false)
-      md = pattern.match(@buffer)
-      @buffer = $' if cons and md
-      return md
+      if cons
+        @scanner.scan(pattern).nil? ? nil : @scanner
+      else
+        @scanner.check(pattern).nil? ? nil : @scanner
+      end
     end

     # @return true if the Source is exhausted
     def empty?
-      @buffer == ""
-    end
-
-    def position
-      @orig.index( @buffer )
+      @scanner.eos?
     end

     # @return the current line in the source
     def current_line
       lines = @orig.split
-      res = lines.grep @buffer[0..30]
+      res = lines.grep @scanner.rest[0..30]
       res = res[-1] if res.kind_of? Array
       lines.index( res ) if res
     end

     private
+
     def detect_encoding
-      buffer_encoding = @buffer.encoding
+      scanner_encoding = @scanner.rest.encoding
       detected_encoding = "UTF-8"
       begin
-        @buffer.force_encoding("ASCII-8BIT")
-        if @buffer[0, 2] == "\xfe\xff"
-          @buffer[0, 2] = ""
+        @scanner.string.force_encoding("ASCII-8BIT")
+        if @scanner.scan(/\xfe\xff/n)
           detected_encoding = "UTF-16BE"
-        elsif @buffer[0, 2] == "\xff\xfe"
-          @buffer[0, 2] = ""
+        elsif @scanner.scan(/\xff\xfe/n)
           detected_encoding = "UTF-16LE"
-        elsif @buffer[0, 3] == "\xef\xbb\xbf"
-          @buffer[0, 3] = ""
+        elsif @scanner.scan(/\xef\xbb\xbf/n)
           detected_encoding = "UTF-8"
         end
       ensure
-        @buffer.force_encoding(buffer_encoding)
+        @scanner.string.force_encoding(scanner_encoding)
       end
       self.encoding = detected_encoding
     end

     def encoding_updated
       if @encoding != 'UTF-8'
-        @buffer = decode(@buffer)
+        @scanner.string = decode(@scanner.rest)
         @to_utf = true
       else
         @to_utf = false
-        @buffer.force_encoding ::Encoding::UTF_8
+        @scanner.string.force_encoding(::Encoding::UTF_8)
       end
     end
   end
@@ -172,7 +138,7 @@ module REXML
       end

       if !@to_utf and
-          @buffer.respond_to?(:force_encoding) and
+          @orig.respond_to?(:force_encoding) and
           @source.respond_to?(:external_encoding) and
           @source.external_encoding != ::Encoding::UTF_8
         @force_utf8 = true
@@ -181,65 +147,44 @@ module REXML
       end
     end

-    def scan(pattern, cons=false)
-      rv = super
-      # You'll notice that this next section is very similar to the same
-      # section in match(), but just a liiittle different.  This is
-      # because it is a touch faster to do it this way with scan()
-      # than the way match() does it; enough faster to warrant duplicating
-      # some code
-      if rv.size == 0
-        until @buffer =~ pattern or @source.nil?
-          begin
-            @buffer << readline
-          rescue Iconv::IllegalSequence
-            raise
-          rescue
-            @source = nil
-          end
-        end
-        rv = super
-      end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
-    end
-
     def read
       begin
-        @buffer << readline
+        # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
+        # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
+        # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
+        # and avoids this problem.
+        @scanner.string = @scanner.rest + readline
       rescue Exception, NameError
         @source = nil
       end
     end

-    def consume( pattern )
-      match( pattern, true )
-    end
-
     def match( pattern, cons=false )
-      rv = pattern.match(@buffer)
-      @buffer = $' if cons and rv
-      while !rv and @source
+      if cons
+        md = @scanner.scan(pattern)
+      else
+        md = @scanner.check(pattern)
+      end
+      while md.nil? and @source
         begin
-          @buffer << readline
-          rv = pattern.match(@buffer)
-          @buffer = $' if cons and rv
+          @scanner << readline
+          if cons
+            md = @scanner.scan(pattern)
+          else
+            md = @scanner.check(pattern)
+          end
         rescue
           @source = nil
         end
       end
-      rv.taint if RUBY_VERSION < '2.7'
-      rv
+
+      md.nil? ? nil : @scanner
     end

     def empty?
       super and ( @source.nil? || @source.eof? )
     end

-    def position
-      @er_source.pos rescue 0
-    end
-
     # @return the current line in the source
     def current_line
       begin
@@ -290,7 +235,7 @@ module REXML
         @source.set_encoding(@encoding, @encoding)
       end
       @line_break = encode(">")
-      @pending_buffer, @buffer = @buffer, ""
+      @pending_buffer, @scanner.string = @scanner.rest, ""
       @pending_buffer.force_encoding(@encoding)
       super
     end
--
2.40.0