mirror of
https://git.yoctoproject.org/poky
synced 2026-03-09 16:59:40 +01:00
REXML is an XML toolkit for Ruby. The REXML gem before 3.3.9 has a ReDoS vulnerability when it parses an XML that has many digits between &# and x...; in a hex numeric character reference (&#x.... This does not happen with Ruby 3.2 or later. Ruby 3.1 is the only affected maintained Ruby. The REXML gem 3.3.9 or later include the patch to fix the vulnerability. CVE-2024-49761-0009.patch is the CVE fix and rest are dependent commits. Reference: https://nvd.nist.gov/vuln/detail/CVE-2024-49761 Upstream-patch:810d22852383ca5c4b0f51217dbcc67e4049f6a6fc6cad570b7712855547370666e314a579730f25ce59f2eb1a(From OE-Core rev: 5b453400e9dd878b81b1447d14b3f518809de17e) Signed-off-by: Divya Chellam <divya.chellam@windriver.com> Signed-off-by: Steve Sakoman <steve@sakoman.com>
392 lines
13 KiB
Diff
392 lines
13 KiB
Diff
From 810d2285235d5501a0a124f300832e6e9515da3c Mon Sep 17 00:00:00 2001
|
|
From: NAITOH Jun <naitoh@gmail.com>
|
|
Date: Wed, 17 Jan 2024 15:32:57 +0900
|
|
Subject: [PATCH] Use string scanner with baseparser (#105)
|
|
|
|
Using StringScanner reduces the string copying process and speeds up the
|
|
process.
|
|
|
|
And I removed unnecessary methods.
|
|
|
|
https://github.com/ruby/rexml/actions/runs/7549990000/job/20554906140?pr=105
|
|
|
|
```
|
|
ruby 3.3.0 (2023-12-25 revision 5124f9ac75) [x86_64-linux]
|
|
Calculating -------------------------------------
|
|
rexml 3.2.6 master 3.2.6(YJIT) master(YJIT)
|
|
dom 4.868 5.077 8.137 8.303 i/s - 100.000 times in 20.540529s 19.696590s 12.288900s 12.043666s
|
|
sax 13.597 13.953 19.206 20.948 i/s - 100.000 times in 7.354343s 7.167142s 5.206745s 4.773765s
|
|
pull 15.641 16.918 22.266 25.378 i/s - 100.000 times in 6.393424s 5.910955s 4.491201s 3.940471s
|
|
stream 14.339 15.844 19.810 22.206 i/s - 100.000 times in 6.973856s 6.311350s 5.047957s 4.503244s
|
|
|
|
Comparison:
|
|
dom
|
|
master(YJIT): 8.3 i/s
|
|
3.2.6(YJIT): 8.1 i/s - 1.02x slower
|
|
master: 5.1 i/s - 1.64x slower
|
|
rexml 3.2.6: 4.9 i/s - 1.71x slower
|
|
|
|
sax
|
|
master(YJIT): 20.9 i/s
|
|
3.2.6(YJIT): 19.2 i/s - 1.09x slower
|
|
master: 14.0 i/s - 1.50x slower
|
|
rexml 3.2.6: 13.6 i/s - 1.54x slower
|
|
|
|
pull
|
|
master(YJIT): 25.4 i/s
|
|
3.2.6(YJIT): 22.3 i/s - 1.14x slower
|
|
master: 16.9 i/s - 1.50x slower
|
|
rexml 3.2.6: 15.6 i/s - 1.62x slower
|
|
|
|
stream
|
|
master(YJIT): 22.2 i/s
|
|
3.2.6(YJIT): 19.8 i/s - 1.12x slower
|
|
master: 15.8 i/s - 1.40x slower
|
|
rexml 3.2.6: 14.3 i/s - 1.55x slower
|
|
```
|
|
|
|
- YJIT=ON : 1.02x - 1.14x faster
|
|
- YJIT=OFF : 1.02x - 1.10x faster
|
|
|
|
---------
|
|
|
|
Co-authored-by: Sutou Kouhei <kou@cozmixng.org>
|
|
|
|
CVE: CVE-2024-49761
|
|
|
|
Upstream-Status: Backport [https://github.com/ruby/rexml/commit/810d2285235d5501a0a124f300832e6e9515da3c]
|
|
|
|
Signed-off-by: Divya Chellam <divya.chellam@windriver.com>
|
|
---
|
|
.../lib/rexml/parsers/baseparser.rb | 21 ++-
|
|
.bundle/gems/rexml-3.2.5/lib/rexml/source.rb | 149 ++++++------------
|
|
2 files changed, 56 insertions(+), 114 deletions(-)
|
|
|
|
diff --git a/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb b/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
|
|
index 305b120..65bad26 100644
|
|
--- a/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
|
|
+++ b/.bundle/gems/rexml-3.2.5/lib/rexml/parsers/baseparser.rb
|
|
@@ -96,7 +96,7 @@ module REXML
|
|
ENTITYDEF = "(?:#{ENTITYVALUE}|(?:#{EXTERNALID}(#{NDATADECL})?))"
|
|
PEDECL = "<!ENTITY\\s+(%)\\s+#{NAME}\\s+#{PEDEF}\\s*>"
|
|
GEDECL = "<!ENTITY\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
|
|
- ENTITYDECL = /\s*(?:#{GEDECL})|(?:#{PEDECL})/um
|
|
+ ENTITYDECL = /\s*(?:#{GEDECL})|\s*(?:#{PEDECL})/um
|
|
|
|
NOTATIONDECL_START = /\A\s*<!NOTATION/um
|
|
EXTERNAL_ID_PUBLIC = /\A\s*PUBLIC\s+#{PUBIDLITERAL}\s+#{SYSTEMLITERAL}\s*/um
|
|
@@ -259,7 +259,7 @@ module REXML
|
|
else
|
|
@document_status = :after_doctype
|
|
if @source.encoding == "UTF-8"
|
|
- @source.buffer.force_encoding(::Encoding::UTF_8)
|
|
+ @source.buffer_encoding = ::Encoding::UTF_8
|
|
end
|
|
end
|
|
end
|
|
@@ -274,8 +274,7 @@ module REXML
|
|
return [ :elementdecl, @source.match( ELEMENTDECL_PATTERN, true )[1] ]
|
|
|
|
when ENTITY_START
|
|
- match = @source.match( ENTITYDECL, true ).to_a.compact
|
|
- match[0] = :entitydecl
|
|
+ match = [:entitydecl, *@source.match( ENTITYDECL, true ).captures.compact]
|
|
ref = false
|
|
if match[1] == '%'
|
|
ref = true
|
|
@@ -392,6 +391,7 @@ module REXML
|
|
unless md
|
|
raise REXML::ParseException.new("malformed XML: missing tag start", @source)
|
|
end
|
|
+ tag = md[1]
|
|
@document_status = :in_element
|
|
prefixes = Set.new
|
|
prefixes << md[2] if md[2]
|
|
@@ -405,23 +405,20 @@ module REXML
|
|
end
|
|
|
|
if closed
|
|
- @closed = md[1]
|
|
+ @closed = tag
|
|
@nsstack.shift
|
|
else
|
|
- @tags.push( md[1] )
|
|
+ @tags.push( tag )
|
|
end
|
|
- return [ :start_element, md[1], attributes ]
|
|
+ return [ :start_element, tag, attributes ]
|
|
end
|
|
else
|
|
md = @source.match( TEXT_PATTERN, true )
|
|
+ text = md[1]
|
|
if md[0].length == 0
|
|
@source.match( /(\s+)/, true )
|
|
end
|
|
- #STDERR.puts "GOT #{md[1].inspect}" unless md[0].length == 0
|
|
- #return [ :text, "" ] if md[0].length == 0
|
|
- # unnormalized = Text::unnormalize( md[1], self )
|
|
- # return PullEvent.new( :text, md[1], unnormalized )
|
|
- return [ :text, md[1] ]
|
|
+ return [ :text, text ]
|
|
end
|
|
rescue REXML::UndefinedNamespaceException
|
|
raise
|
|
diff --git a/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb b/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb
|
|
index 90b370b..71b08f9 100644
|
|
--- a/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb
|
|
+++ b/.bundle/gems/rexml-3.2.5/lib/rexml/source.rb
|
|
@@ -30,8 +30,6 @@ module REXML
|
|
# objects and provides consumption of text
|
|
class Source
|
|
include Encoding
|
|
- # The current buffer (what we're going to read next)
|
|
- attr_reader :buffer
|
|
# The line number of the last consumed text
|
|
attr_reader :line
|
|
attr_reader :encoding
|
|
@@ -41,7 +39,8 @@ module REXML
|
|
# @param encoding if non-null, sets the encoding of the source to this
|
|
# value, overriding all encoding detection
|
|
def initialize(arg, encoding=nil)
|
|
- @orig = @buffer = arg
|
|
+ @orig = arg
|
|
+ @scanner = StringScanner.new(@orig)
|
|
if encoding
|
|
self.encoding = encoding
|
|
else
|
|
@@ -50,6 +49,14 @@ module REXML
|
|
@line = 0
|
|
end
|
|
|
|
+ # The current buffer (what we're going to read next)
|
|
+ def buffer
|
|
+ @scanner.rest
|
|
+ end
|
|
+
|
|
+ def buffer_encoding=(encoding)
|
|
+ @scanner.string.force_encoding(encoding)
|
|
+ end
|
|
|
|
# Inherited from Encoding
|
|
# Overridden to support optimized en/decoding
|
|
@@ -58,98 +65,57 @@ module REXML
|
|
encoding_updated
|
|
end
|
|
|
|
- # Scans the source for a given pattern. Note, that this is not your
|
|
- # usual scan() method. For one thing, the pattern argument has some
|
|
- # requirements; for another, the source can be consumed. You can easily
|
|
- # confuse this method. Originally, the patterns were easier
|
|
- # to construct and this method more robust, because this method
|
|
- # generated search regexps on the fly; however, this was
|
|
- # computationally expensive and slowed down the entire REXML package
|
|
- # considerably, since this is by far the most commonly called method.
|
|
- # @param pattern must be a Regexp, and must be in the form of
|
|
- # /^\s*(#{your pattern, with no groups})(.*)/. The first group
|
|
- # will be returned; the second group is used if the consume flag is
|
|
- # set.
|
|
- # @param consume if true, the pattern returned will be consumed, leaving
|
|
- # everything after it in the Source.
|
|
- # @return the pattern, if found, or nil if the Source is empty or the
|
|
- # pattern is not found.
|
|
- def scan(pattern, cons=false)
|
|
- return nil if @buffer.nil?
|
|
- rv = @buffer.scan(pattern)
|
|
- @buffer = $' if cons and rv.size>0
|
|
- rv
|
|
- end
|
|
-
|
|
def read
|
|
end
|
|
|
|
- def consume( pattern )
|
|
- @buffer = $' if pattern.match( @buffer )
|
|
- end
|
|
-
|
|
- def match_to( char, pattern )
|
|
- return pattern.match(@buffer)
|
|
- end
|
|
-
|
|
- def match_to_consume( char, pattern )
|
|
- md = pattern.match(@buffer)
|
|
- @buffer = $'
|
|
- return md
|
|
- end
|
|
-
|
|
def match(pattern, cons=false)
|
|
- md = pattern.match(@buffer)
|
|
- @buffer = $' if cons and md
|
|
- return md
|
|
+ if cons
|
|
+ @scanner.scan(pattern).nil? ? nil : @scanner
|
|
+ else
|
|
+ @scanner.check(pattern).nil? ? nil : @scanner
|
|
+ end
|
|
end
|
|
|
|
# @return true if the Source is exhausted
|
|
def empty?
|
|
- @buffer == ""
|
|
- end
|
|
-
|
|
- def position
|
|
- @orig.index( @buffer )
|
|
+ @scanner.eos?
|
|
end
|
|
|
|
# @return the current line in the source
|
|
def current_line
|
|
lines = @orig.split
|
|
- res = lines.grep @buffer[0..30]
|
|
+ res = lines.grep @scanner.rest[0..30]
|
|
res = res[-1] if res.kind_of? Array
|
|
lines.index( res ) if res
|
|
end
|
|
|
|
private
|
|
+
|
|
def detect_encoding
|
|
- buffer_encoding = @buffer.encoding
|
|
+ scanner_encoding = @scanner.rest.encoding
|
|
detected_encoding = "UTF-8"
|
|
begin
|
|
- @buffer.force_encoding("ASCII-8BIT")
|
|
- if @buffer[0, 2] == "\xfe\xff"
|
|
- @buffer[0, 2] = ""
|
|
+ @scanner.string.force_encoding("ASCII-8BIT")
|
|
+ if @scanner.scan(/\xfe\xff/n)
|
|
detected_encoding = "UTF-16BE"
|
|
- elsif @buffer[0, 2] == "\xff\xfe"
|
|
- @buffer[0, 2] = ""
|
|
+ elsif @scanner.scan(/\xff\xfe/n)
|
|
detected_encoding = "UTF-16LE"
|
|
- elsif @buffer[0, 3] == "\xef\xbb\xbf"
|
|
- @buffer[0, 3] = ""
|
|
+ elsif @scanner.scan(/\xef\xbb\xbf/n)
|
|
detected_encoding = "UTF-8"
|
|
end
|
|
ensure
|
|
- @buffer.force_encoding(buffer_encoding)
|
|
+ @scanner.string.force_encoding(scanner_encoding)
|
|
end
|
|
self.encoding = detected_encoding
|
|
end
|
|
|
|
def encoding_updated
|
|
if @encoding != 'UTF-8'
|
|
- @buffer = decode(@buffer)
|
|
+ @scanner.string = decode(@scanner.rest)
|
|
@to_utf = true
|
|
else
|
|
@to_utf = false
|
|
- @buffer.force_encoding ::Encoding::UTF_8
|
|
+ @scanner.string.force_encoding(::Encoding::UTF_8)
|
|
end
|
|
end
|
|
end
|
|
@@ -172,7 +138,7 @@ module REXML
|
|
end
|
|
|
|
if !@to_utf and
|
|
- @buffer.respond_to?(:force_encoding) and
|
|
+ @orig.respond_to?(:force_encoding) and
|
|
@source.respond_to?(:external_encoding) and
|
|
@source.external_encoding != ::Encoding::UTF_8
|
|
@force_utf8 = true
|
|
@@ -181,65 +147,44 @@ module REXML
|
|
end
|
|
end
|
|
|
|
- def scan(pattern, cons=false)
|
|
- rv = super
|
|
- # You'll notice that this next section is very similar to the same
|
|
- # section in match(), but just a liiittle different. This is
|
|
- # because it is a touch faster to do it this way with scan()
|
|
- # than the way match() does it; enough faster to warrant duplicating
|
|
- # some code
|
|
- if rv.size == 0
|
|
- until @buffer =~ pattern or @source.nil?
|
|
- begin
|
|
- @buffer << readline
|
|
- rescue Iconv::IllegalSequence
|
|
- raise
|
|
- rescue
|
|
- @source = nil
|
|
- end
|
|
- end
|
|
- rv = super
|
|
- end
|
|
- rv.taint if RUBY_VERSION < '2.7'
|
|
- rv
|
|
- end
|
|
-
|
|
def read
|
|
begin
|
|
- @buffer << readline
|
|
+ # NOTE: `@scanner << readline` does not free memory, so when parsing huge XML in JRuby's DOM,
|
|
+ # out-of-memory error `Java::JavaLang::OutOfMemoryError: Java heap space` occurs.
|
|
+ # `@scanner.string = @scanner.rest + readline` frees memory that is already consumed
|
|
+ # and avoids this problem.
|
|
+ @scanner.string = @scanner.rest + readline
|
|
rescue Exception, NameError
|
|
@source = nil
|
|
end
|
|
end
|
|
|
|
- def consume( pattern )
|
|
- match( pattern, true )
|
|
- end
|
|
-
|
|
def match( pattern, cons=false )
|
|
- rv = pattern.match(@buffer)
|
|
- @buffer = $' if cons and rv
|
|
- while !rv and @source
|
|
+ if cons
|
|
+ md = @scanner.scan(pattern)
|
|
+ else
|
|
+ md = @scanner.check(pattern)
|
|
+ end
|
|
+ while md.nil? and @source
|
|
begin
|
|
- @buffer << readline
|
|
- rv = pattern.match(@buffer)
|
|
- @buffer = $' if cons and rv
|
|
+ @scanner << readline
|
|
+ if cons
|
|
+ md = @scanner.scan(pattern)
|
|
+ else
|
|
+ md = @scanner.check(pattern)
|
|
+ end
|
|
rescue
|
|
@source = nil
|
|
end
|
|
end
|
|
- rv.taint if RUBY_VERSION < '2.7'
|
|
- rv
|
|
+
|
|
+ md.nil? ? nil : @scanner
|
|
end
|
|
|
|
def empty?
|
|
super and ( @source.nil? || @source.eof? )
|
|
end
|
|
|
|
- def position
|
|
- @er_source.pos rescue 0
|
|
- end
|
|
-
|
|
# @return the current line in the source
|
|
def current_line
|
|
begin
|
|
@@ -290,7 +235,7 @@ module REXML
|
|
@source.set_encoding(@encoding, @encoding)
|
|
end
|
|
@line_break = encode(">")
|
|
- @pending_buffer, @buffer = @buffer, ""
|
|
+ @pending_buffer, @scanner.string = @scanner.rest, ""
|
|
@pending_buffer.force_encoding(@encoding)
|
|
super
|
|
end
|
|
--
|
|
2.40.0
|