From e3468fb3c4f9dbe217892e6648c65e69505d937a Mon Sep 17 00:00:00 2001 From: HoneyryderChuck Date: Fri, 11 Jun 2021 14:44:07 +0100 Subject: [PATCH 1/6] isolating the punycode module --- .rubocop.yml | 1 + .simplecov | 1 + lib/httpx.rb | 1 + lib/httpx/domain_name.rb | 290 -------------------------------------- lib/httpx/punycode.rb | 291 +++++++++++++++++++++++++++++++++++++++ 5 files changed, 294 insertions(+), 290 deletions(-) create mode 100644 lib/httpx/punycode.rb diff --git a/.rubocop.yml b/.rubocop.yml index a89d5d95..76361070 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -19,6 +19,7 @@ AllCops: - 'vendor/**/*' - 'www/**/*' - 'lib/httpx/extensions.rb' + - 'lib/httpx/punycode.rb' # Do not lint ffi block, for openssl parity - 'lib/httpx/io/tls/*.rb' diff --git a/.simplecov b/.simplecov index fb01891a..3b4bd691 100644 --- a/.simplecov +++ b/.simplecov @@ -10,4 +10,5 @@ SimpleCov.start do add_filter "/lib/httpx/plugins/multipart/mime_type_detector.rb" add_filter "/lib/httpx/io/tls/" add_filter "/lib/httpx/io/tls.rb" + add_filter "/lib/httpx/punycode.rb" end diff --git a/lib/httpx.rb b/lib/httpx.rb index b57602d0..3abdaf99 100644 --- a/lib/httpx.rb +++ b/lib/httpx.rb @@ -6,6 +6,7 @@ require "httpx/extensions" require "httpx/errors" require "httpx/utils" +require "httpx/punycode" require "httpx/domain_name" require "httpx/altsvc" require "httpx/callbacks" diff --git a/lib/httpx/domain_name.rb b/lib/httpx/domain_name.rb index 4767b54f..e58c5a4d 100644 --- a/lib/httpx/domain_name.rb +++ b/lib/httpx/domain_name.rb @@ -144,295 +144,5 @@ module HTTPX 1 end end - - # :nocov: - # rubocop:disable all - # -*- coding: utf-8 -*- - #-- - # punycode.rb - PunyCode encoder for the Domain Name library - # - # Copyright (C) 2011-2017 Akinori MUSHA, All rights reserved. - # - # Ported from puny.c, a part of VeriSign XCode (encode/decode) IDN - # Library. - # - # Copyright (C) 2000-2002 Verisign Inc., All rights reserved. 
- # - # Redistribution and use in source and binary forms, with or - # without modification, are permitted provided that the following - # conditions are met: - # - # 1) Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # - # 2) Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in - # the documentation and/or other materials provided with the - # distribution. - # - # 3) Neither the name of the VeriSign Inc. nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - # POSSIBILITY OF SUCH DAMAGE. - # - # This software is licensed under the BSD open source license. For more - # information visit www.opensource.org. 
- # - # Authors: - # John Colosi (VeriSign) - # Srikanth Veeramachaneni (VeriSign) - # Nagesh Chigurupati (Verisign) - # Praveen Srinivasan(Verisign) - #++ - module Punycode - BASE = 36 - TMIN = 1 - TMAX = 26 - SKEW = 38 - DAMP = 700 - INITIAL_BIAS = 72 - INITIAL_N = 0x80 - DELIMITER = "-" - - MAXINT = (1 << 32) - 1 - - LOBASE = BASE - TMIN - CUTOFF = LOBASE * TMAX / 2 - - RE_NONBASIC = /[^\x00-\x7f]/.freeze - - # Returns the numeric value of a basic code point (for use in - # representing integers) in the range 0 to base-1, or nil if cp - # is does not represent a value. - DECODE_DIGIT = {}.tap do |map| - # ASCII A..Z map to 0..25 - # ASCII a..z map to 0..25 - (0..25).each { |i| map[65 + i] = map[97 + i] = i } - # ASCII 0..9 map to 26..35 - (26..35).each { |i| map[22 + i] = i } - end - - # Returns the basic code point whose value (when used for - # representing integers) is d, which must be in the range 0 to - # BASE-1. The lowercase form is used unless flag is true, in - # which case the uppercase form is used. The behavior is - # undefined if flag is nonzero and digit d has no uppercase - # form. - ENCODE_DIGIT = proc { |d, flag| - (d + 22 + (d < 26 ? 75 : 0) - (flag ? (1 << 5) : 0)).chr - # 0..25 map to ASCII a..z or A..Z - # 26..35 map to ASCII 0..9 - } - - DOT = "." - PREFIX = "xn--" - - # Most errors we raise are basically kind of ArgumentError. - class ArgumentError < ::ArgumentError; end - class BufferOverflowError < ArgumentError; end - - class << self - # Encode a +string+ in Punycode - def encode(string) - input = string.unpack("U*") - output = +"" - - # Initialize the state - n = INITIAL_N - delta = 0 - bias = INITIAL_BIAS - - # Handle the basic code points - input.each { |cp| output << cp.chr if cp < 0x80 } - - h = b = output.length - - # h is the number of code points that have been handled, b is the - # number of basic code points, and out is the number of characters - # that have been output. 
- - output << DELIMITER if b > 0 - - # Main encoding loop - - while h < input.length - # All non-basic code points < n have been handled already. Find - # the next larger one - - m = MAXINT - input.each do |cp| - m = cp if (n...m) === cp - end - - # Increase delta enough to advance the decoder's state to - # , but guard against overflow - - delta += (m - n) * (h + 1) - raise BufferOverflowError if delta > MAXINT - - n = m - - input.each do |cp| - # AMC-ACE-Z can use this simplified version instead - if cp < n - delta += 1 - raise BufferOverflowError if delta > MAXINT - elsif cp == n - # Represent delta as a generalized variable-length integer - q = delta - k = BASE - loop do - t = k <= bias ? TMIN : k - bias >= TMAX ? TMAX : k - bias - break if q < t - - q, r = (q - t).divmod(BASE - t) - output << ENCODE_DIGIT[t + r, false] - k += BASE - end - - output << ENCODE_DIGIT[q, false] - - # Adapt the bias - delta = h == b ? delta / DAMP : delta >> 1 - delta += delta / (h + 1) - bias = 0 - while delta > CUTOFF - delta /= LOBASE - bias += BASE - end - bias += (LOBASE + 1) * delta / (delta + SKEW) - - delta = 0 - h += 1 - end - end - - delta += 1 - n += 1 - end - - output - end - - # Encode a hostname using IDN/Punycode algorithms - def encode_hostname(hostname) - hostname.match(RE_NONBASIC) || (return hostname) - - hostname.split(DOT).map do |name| - if name.match(RE_NONBASIC) - PREFIX + encode(name) - else - name - end - end.join(DOT) - end - - # Decode a +string+ encoded in Punycode - def decode(string) - # Initialize the state - n = INITIAL_N - i = 0 - bias = INITIAL_BIAS - - if j = string.rindex(DELIMITER) - b = string[0...j] - - b.match(RE_NONBASIC) && - raise(ArgumentError, "Illegal character is found in basic part: #{string.inspect}") - - # Handle the basic code points - - output = b.unpack("U*") - u = string[(j + 1)..-1] - else - output = [] - u = string - end - - # Main decoding loop: Start just after the last delimiter if any - # basic code points were copied; 
start at the beginning - # otherwise. - - input = u.unpack("C*") - input_length = input.length - h = 0 - out = output.length - - while h < input_length - # Decode a generalized variable-length integer into delta, - # which gets added to i. The overflow checking is easier - # if we increase i as we go, then subtract off its starting - # value at the end to obtain delta. - - oldi = i - w = 1 - k = BASE - - loop do - (digit = DECODE_DIGIT[input[h]]) || - raise(ArgumentError, "Illegal character is found in non-basic part: #{string.inspect}") - h += 1 - i += digit * w - raise BufferOverflowError if i > MAXINT - - t = k <= bias ? TMIN : k - bias >= TMAX ? TMAX : k - bias - break if digit < t - - w *= BASE - t - raise BufferOverflowError if w > MAXINT - - k += BASE - (h < input_length) || raise(ArgumentError, "Malformed input given: #{string.inspect}") - end - - # Adapt the bias - delta = oldi == 0 ? i / DAMP : (i - oldi) >> 1 - delta += delta / (out + 1) - bias = 0 - while delta > CUTOFF - delta /= LOBASE - bias += BASE - end - bias += (LOBASE + 1) * delta / (delta + SKEW) - - # i was supposed to wrap around from out+1 to 0, incrementing - # n each time, so we'll fix that now: - - q, i = i.divmod(out + 1) - n += q - raise BufferOverflowError if n > MAXINT - - # Insert n at position i of the output: - - output[i, 0] = n - - out += 1 - i += 1 - end - output.pack("U*") - end - - # Decode a hostname using IDN/Punycode algorithms - def decode_hostname(hostname) - hostname.gsub(/(\A|#{Regexp.quote(DOT)})#{Regexp.quote(PREFIX)}([^#{Regexp.quote(DOT)}]*)/o) do - Regexp.last_match(1) << decode(Regexp.last_match(2)) - end - end - end - # rubocop:enable all - # :nocov: - end end end diff --git a/lib/httpx/punycode.rb b/lib/httpx/punycode.rb new file mode 100644 index 00000000..7d023c6e --- /dev/null +++ b/lib/httpx/punycode.rb @@ -0,0 +1,291 @@ +# frozen_string_literal: true + +module HTTPX + # :nocov: + # -*- coding: utf-8 -*- + #-- + # punycode.rb - PunyCode encoder for the 
Domain Name library + # + # Copyright (C) 2011-2017 Akinori MUSHA, All rights reserved. + # + # Ported from puny.c, a part of VeriSign XCode (encode/decode) IDN + # Library. + # + # Copyright (C) 2000-2002 Verisign Inc., All rights reserved. + # + # Redistribution and use in source and binary forms, with or + # without modification, are permitted provided that the following + # conditions are met: + # + # 1) Redistributions of source code must retain the above copyright + # notice, this list of conditions and the following disclaimer. + # + # 2) Redistributions in binary form must reproduce the above copyright + # notice, this list of conditions and the following disclaimer in + # the documentation and/or other materials provided with the + # distribution. + # + # 3) Neither the name of the VeriSign Inc. nor the names of its + # contributors may be used to endorse or promote products derived + # from this software without specific prior written permission. + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + # POSSIBILITY OF SUCH DAMAGE. + # + # This software is licensed under the BSD open source license. For more + # information visit www.opensource.org. 
+ # + # Authors: + # John Colosi (VeriSign) + # Srikanth Veeramachaneni (VeriSign) + # Nagesh Chigurupati (Verisign) + # Praveen Srinivasan(Verisign) + #++ + module Punycode + BASE = 36 + TMIN = 1 + TMAX = 26 + SKEW = 38 + DAMP = 700 + INITIAL_BIAS = 72 + INITIAL_N = 0x80 + DELIMITER = "-" + + MAXINT = (1 << 32) - 1 + + LOBASE = BASE - TMIN + CUTOFF = LOBASE * TMAX / 2 + + RE_NONBASIC = /[^\x00-\x7f]/.freeze + + # Returns the numeric value of a basic code point (for use in + # representing integers) in the range 0 to base-1, or nil if cp + # is does not represent a value. + DECODE_DIGIT = {}.tap do |map| + # ASCII A..Z map to 0..25 + # ASCII a..z map to 0..25 + (0..25).each { |i| map[65 + i] = map[97 + i] = i } + # ASCII 0..9 map to 26..35 + (26..35).each { |i| map[22 + i] = i } + end + + # Returns the basic code point whose value (when used for + # representing integers) is d, which must be in the range 0 to + # BASE-1. The lowercase form is used unless flag is true, in + # which case the uppercase form is used. The behavior is + # undefined if flag is nonzero and digit d has no uppercase + # form. + ENCODE_DIGIT = proc { |d, flag| + (d + 22 + (d < 26 ? 75 : 0) - (flag ? (1 << 5) : 0)).chr + # 0..25 map to ASCII a..z or A..Z + # 26..35 map to ASCII 0..9 + } + + DOT = "." + PREFIX = "xn--" + + # Most errors we raise are basically kind of ArgumentError. + class ArgumentError < ::ArgumentError; end + class BufferOverflowError < ArgumentError; end + + class << self + # Encode a +string+ in Punycode + def encode(string) + input = string.unpack("U*") + output = +"" + + # Initialize the state + n = INITIAL_N + delta = 0 + bias = INITIAL_BIAS + + # Handle the basic code points + input.each { |cp| output << cp.chr if cp < 0x80 } + + h = b = output.length + + # h is the number of code points that have been handled, b is the + # number of basic code points, and out is the number of characters + # that have been output. 
+ + output << DELIMITER if b > 0 + + # Main encoding loop + + while h < input.length + # All non-basic code points < n have been handled already. Find + # the next larger one + + m = MAXINT + input.each do |cp| + m = cp if (n...m) === cp + end + + # Increase delta enough to advance the decoder's state to + # , but guard against overflow + + delta += (m - n) * (h + 1) + raise BufferOverflowError if delta > MAXINT + + n = m + + input.each do |cp| + # AMC-ACE-Z can use this simplified version instead + if cp < n + delta += 1 + raise BufferOverflowError if delta > MAXINT + elsif cp == n + # Represent delta as a generalized variable-length integer + q = delta + k = BASE + loop do + t = k <= bias ? TMIN : k - bias >= TMAX ? TMAX : k - bias + break if q < t + + q, r = (q - t).divmod(BASE - t) + output << ENCODE_DIGIT[t + r, false] + k += BASE + end + + output << ENCODE_DIGIT[q, false] + + # Adapt the bias + delta = h == b ? delta / DAMP : delta >> 1 + delta += delta / (h + 1) + bias = 0 + while delta > CUTOFF + delta /= LOBASE + bias += BASE + end + bias += (LOBASE + 1) * delta / (delta + SKEW) + + delta = 0 + h += 1 + end + end + + delta += 1 + n += 1 + end + + output + end + + # Encode a hostname using IDN/Punycode algorithms + def encode_hostname(hostname) + hostname.match(RE_NONBASIC) || (return hostname) + + hostname.split(DOT).map do |name| + if name.match(RE_NONBASIC) + PREFIX + encode(name) + else + name + end + end.join(DOT) + end + + # Decode a +string+ encoded in Punycode + def decode(string) + # Initialize the state + n = INITIAL_N + i = 0 + bias = INITIAL_BIAS + + if j = string.rindex(DELIMITER) + b = string[0...j] + + b.match(RE_NONBASIC) && + raise(ArgumentError, "Illegal character is found in basic part: #{string.inspect}") + + # Handle the basic code points + + output = b.unpack("U*") + u = string[(j + 1)..-1] + else + output = [] + u = string + end + + # Main decoding loop: Start just after the last delimiter if any + # basic code points were copied; 
start at the beginning + # otherwise. + + input = u.unpack("C*") + input_length = input.length + h = 0 + out = output.length + + while h < input_length + # Decode a generalized variable-length integer into delta, + # which gets added to i. The overflow checking is easier + # if we increase i as we go, then subtract off its starting + # value at the end to obtain delta. + + oldi = i + w = 1 + k = BASE + + loop do + (digit = DECODE_DIGIT[input[h]]) || + raise(ArgumentError, "Illegal character is found in non-basic part: #{string.inspect}") + h += 1 + i += digit * w + raise BufferOverflowError if i > MAXINT + + t = k <= bias ? TMIN : k - bias >= TMAX ? TMAX : k - bias + break if digit < t + + w *= BASE - t + raise BufferOverflowError if w > MAXINT + + k += BASE + (h < input_length) || raise(ArgumentError, "Malformed input given: #{string.inspect}") + end + + # Adapt the bias + delta = oldi == 0 ? i / DAMP : (i - oldi) >> 1 + delta += delta / (out + 1) + bias = 0 + while delta > CUTOFF + delta /= LOBASE + bias += BASE + end + bias += (LOBASE + 1) * delta / (delta + SKEW) + + # i was supposed to wrap around from out+1 to 0, incrementing + # n each time, so we'll fix that now: + + q, i = i.divmod(out + 1) + n += q + raise BufferOverflowError if n > MAXINT + + # Insert n at position i of the output: + + output[i, 0] = n + + out += 1 + i += 1 + end + output.pack("U*") + end + + # Decode a hostname using IDN/Punycode algorithms + def decode_hostname(hostname) + hostname.gsub(/(\A|#{Regexp.quote(DOT)})#{Regexp.quote(PREFIX)}([^#{Regexp.quote(DOT)}]*)/o) do + Regexp.last_match(1) << decode(Regexp.last_match(2)) + end + end + end + # :nocov: + end +end \ No newline at end of file From 5d64f93ec4366cbaa9598a45877a84f92ffe3c48 Mon Sep 17 00:00:00 2001 From: HoneyryderChuck Date: Fri, 11 Jun 2021 14:45:23 +0100 Subject: [PATCH 2/6] isolating idn test --- test/support/requests/get.rb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/test/support/requests/get.rb 
b/test/support/requests/get.rb index be77d4e1..71137399 100644 --- a/test/support/requests/get.rb +++ b/test/support/requests/get.rb @@ -94,11 +94,13 @@ module Requests response.close end - def test_get_non_ascii + def test_get_idn response = HTTPX.get("http://bücher.ch") verify_status(response, 200) response.close + end unless RUBY_VERSION < "2.3" + def test_get_non_ascii response = HTTPX.get(build_uri("/get?q=ã")) verify_status(response, 200) response.close From 4df2363cc5b5ade0147b35f6075cb06001add07f Mon Sep 17 00:00:00 2001 From: HoneyryderChuck Date: Fri, 11 Jun 2021 14:46:14 +0100 Subject: [PATCH 3/6] providing strategy for punycode translation, using idnx when possible, falling back otherwise --- Gemfile | 1 + lib/httpx/connection.rb | 2 +- lib/httpx/punycode.rb | 195 +++++++++++++++++++++------------------ lib/httpx/request.rb | 2 +- lib/httpx/utils.rb | 10 +- sig/utils.rbs | 7 ++ test/support/ci/build.sh | 6 +- 7 files changed, 123 insertions(+), 100 deletions(-) create mode 100644 sig/utils.rbs diff --git a/Gemfile b/Gemfile index 87bdaeb8..cba0e4fe 100644 --- a/Gemfile +++ b/Gemfile @@ -66,6 +66,7 @@ group :test do gem "aws-sdk-s3" gem "faraday" + gem "idnx" if RUBY_VERSION >= "2.4.0" gem "oga" if RUBY_VERSION >= "3.0.0" diff --git a/lib/httpx/connection.rb b/lib/httpx/connection.rb index 63f6a0c3..24175900 100644 --- a/lib/httpx/connection.rb +++ b/lib/httpx/connection.rb @@ -51,7 +51,7 @@ module HTTPX def initialize(type, uri, options) @type = type @origins = [uri.origin] - @origin = Utils.uri(uri.origin) + @origin = Utils.to_uri(uri.origin) @options = Options.new(options) @window_size = @options.window_size @read_buffer = Buffer.new(BUFFER_SIZE) diff --git a/lib/httpx/punycode.rb b/lib/httpx/punycode.rb index 7d023c6e..c4946d81 100644 --- a/lib/httpx/punycode.rb +++ b/lib/httpx/punycode.rb @@ -1,104 +1,117 @@ # frozen_string_literal: true module HTTPX - # :nocov: - # -*- coding: utf-8 -*- - #-- - # punycode.rb - PunyCode encoder for the Domain Name 
library - # - # Copyright (C) 2011-2017 Akinori MUSHA, All rights reserved. - # - # Ported from puny.c, a part of VeriSign XCode (encode/decode) IDN - # Library. - # - # Copyright (C) 2000-2002 Verisign Inc., All rights reserved. - # - # Redistribution and use in source and binary forms, with or - # without modification, are permitted provided that the following - # conditions are met: - # - # 1) Redistributions of source code must retain the above copyright - # notice, this list of conditions and the following disclaimer. - # - # 2) Redistributions in binary form must reproduce the above copyright - # notice, this list of conditions and the following disclaimer in - # the documentation and/or other materials provided with the - # distribution. - # - # 3) Neither the name of the VeriSign Inc. nor the names of its - # contributors may be used to endorse or promote products derived - # from this software without specific prior written permission. - # - # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS - # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE - # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, - # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, - # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS - # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED - # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN - # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - # POSSIBILITY OF SUCH DAMAGE. - # - # This software is licensed under the BSD open source license. For more - # information visit www.opensource.org. 
- # - # Authors: - # John Colosi (VeriSign) - # Srikanth Veeramachaneni (VeriSign) - # Nagesh Chigurupati (Verisign) - # Praveen Srinivasan(Verisign) - #++ - module Punycode - BASE = 36 - TMIN = 1 - TMAX = 26 - SKEW = 38 - DAMP = 700 - INITIAL_BIAS = 72 - INITIAL_N = 0x80 - DELIMITER = "-" + begin + require "idnx" - MAXINT = (1 << 32) - 1 + module Punycode + module_function - LOBASE = BASE - TMIN - CUTOFF = LOBASE * TMAX / 2 - - RE_NONBASIC = /[^\x00-\x7f]/.freeze - - # Returns the numeric value of a basic code point (for use in - # representing integers) in the range 0 to base-1, or nil if cp - # is does not represent a value. - DECODE_DIGIT = {}.tap do |map| - # ASCII A..Z map to 0..25 - # ASCII a..z map to 0..25 - (0..25).each { |i| map[65 + i] = map[97 + i] = i } - # ASCII 0..9 map to 26..35 - (26..35).each { |i| map[22 + i] = i } + def encode_hostname(hostname) + Idnx.to_punycode(hostname) + end end - # Returns the basic code point whose value (when used for - # representing integers) is d, which must be in the range 0 to - # BASE-1. The lowercase form is used unless flag is true, in - # which case the uppercase form is used. The behavior is - # undefined if flag is nonzero and digit d has no uppercase - # form. - ENCODE_DIGIT = proc { |d, flag| - (d + 22 + (d < 26 ? 75 : 0) - (flag ? (1 << 5) : 0)).chr - # 0..25 map to ASCII a..z or A..Z - # 26..35 map to ASCII 0..9 - } + rescue LoadError + # :nocov: + # -*- coding: utf-8 -*- + #-- + # punycode.rb - PunyCode encoder for the Domain Name library + # + # Copyright (C) 2011-2017 Akinori MUSHA, All rights reserved. + # + # Ported from puny.c, a part of VeriSign XCode (encode/decode) IDN + # Library. + # + # Copyright (C) 2000-2002 Verisign Inc., All rights reserved. 
+ # + # Redistribution and use in source and binary forms, with or + # without modification, are permitted provided that the following + # conditions are met: + # + # 1) Redistributions of source code must retain the above copyright + # notice, this list of conditions and the following disclaimer. + # + # 2) Redistributions in binary form must reproduce the above copyright + # notice, this list of conditions and the following disclaimer in + # the documentation and/or other materials provided with the + # distribution. + # + # 3) Neither the name of the VeriSign Inc. nor the names of its + # contributors may be used to endorse or promote products derived + # from this software without specific prior written permission. + # + # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + # "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + # LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS + # FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE + # COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, + # INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, + # BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS + # OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED + # AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN + # ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + # POSSIBILITY OF SUCH DAMAGE. + # + # This software is licensed under the BSD open source license. For more + # information visit www.opensource.org. + # + # Authors: + # John Colosi (VeriSign) + # Srikanth Veeramachaneni (VeriSign) + # Nagesh Chigurupati (Verisign) + # Praveen Srinivasan(Verisign) + #++ + module Punycode + BASE = 36 + TMIN = 1 + TMAX = 26 + SKEW = 38 + DAMP = 700 + INITIAL_BIAS = 72 + INITIAL_N = 0x80 + DELIMITER = "-" - DOT = "." 
- PREFIX = "xn--" + MAXINT = (1 << 32) - 1 - # Most errors we raise are basically kind of ArgumentError. - class ArgumentError < ::ArgumentError; end - class BufferOverflowError < ArgumentError; end + LOBASE = BASE - TMIN + CUTOFF = LOBASE * TMAX / 2 + + RE_NONBASIC = /[^\x00-\x7f]/.freeze + + # Returns the numeric value of a basic code point (for use in + # representing integers) in the range 0 to base-1, or nil if cp + # is does not represent a value. + DECODE_DIGIT = {}.tap do |map| + # ASCII A..Z map to 0..25 + # ASCII a..z map to 0..25 + (0..25).each { |i| map[65 + i] = map[97 + i] = i } + # ASCII 0..9 map to 26..35 + (26..35).each { |i| map[22 + i] = i } + end + + # Returns the basic code point whose value (when used for + # representing integers) is d, which must be in the range 0 to + # BASE-1. The lowercase form is used unless flag is true, in + # which case the uppercase form is used. The behavior is + # undefined if flag is nonzero and digit d has no uppercase + # form. + ENCODE_DIGIT = proc { |d, flag| + (d + 22 + (d < 26 ? 75 : 0) - (flag ? (1 << 5) : 0)).chr + # 0..25 map to ASCII a..z or A..Z + # 26..35 map to ASCII 0..9 + } + + DOT = "." + PREFIX = "xn--" + + # Most errors we raise are basically kind of ArgumentError. + class ArgumentError < ::ArgumentError; end + class BufferOverflowError < ArgumentError; end + + module_function - class << self # Encode a +string+ in Punycode def encode(string) input = string.unpack("U*") diff --git a/lib/httpx/request.rb b/lib/httpx/request.rb index 5aca7506..63387343 100644 --- a/lib/httpx/request.rb +++ b/lib/httpx/request.rb @@ -45,7 +45,7 @@ module HTTPX def initialize(verb, uri, options = {}) @verb = verb.to_s.downcase.to_sym @options = Options.new(options) - @uri = Utils.uri(uri) + @uri = Utils.to_uri(uri) if @uri.relative? 
raise(Error, "invalid URI: #{@uri}") unless @options.origin diff --git a/lib/httpx/utils.rb b/lib/httpx/utils.rb index fa092fc9..ebc0b7bf 100644 --- a/lib/httpx/utils.rb +++ b/lib/httpx/utils.rb @@ -18,14 +18,16 @@ module HTTPX end if RUBY_VERSION < "2.3" - def uri(*args) - URI(*args) + + def to_uri(uri) + URI(uri) end + else URIParser = URI::RFC2396_Parser.new - def uri(uri) + def to_uri(uri) return Kernel.URI(uri) unless uri.is_a?(String) && !uri.ascii_only? uri = Kernel.URI(URIParser.escape(uri)) @@ -34,7 +36,7 @@ module HTTPX non_ascii_hostname.force_encoding(Encoding::UTF_8) - idna_hostname = DomainName.new(non_ascii_hostname).hostname + idna_hostname = Punycode.encode_hostname(non_ascii_hostname) uri.host = idna_hostname uri.non_ascii_hostname = non_ascii_hostname diff --git a/sig/utils.rbs b/sig/utils.rbs new file mode 100644 index 00000000..c359fe5f --- /dev/null +++ b/sig/utils.rbs @@ -0,0 +1,7 @@ +module HTTPX + module Utils + def self?.parse_retry_after: (String) -> Numeric + + def self?.to_uri: (generic_uri uri) -> URI::Generic + end +end \ No newline at end of file diff --git a/test/support/ci/build.sh b/test/support/ci/build.sh index 89d1b67a..35f635f4 100755 --- a/test/support/ci/build.sh +++ b/test/support/ci/build.sh @@ -6,12 +6,12 @@ RUBY_PLATFORM=`ruby -e 'puts RUBY_PLATFORM'` RUBY_ENGINE=`ruby -e 'puts RUBY_ENGINE'` if [[ "$RUBY_ENGINE" = "truffleruby" ]]; then - microdnf install -y iptables iproute which file + microdnf install -y iptables iproute which file idn2 elif [[ "$RUBY_PLATFORM" = "java" ]]; then echo " deb http://deb.debian.org/debian sid main contrib non-free deb-src http://deb.debian.org/debian sid main contrib non-free" >> /etc/apt/sources.list - apt-get update && apt-get install -y iptables openssl libssl-dev ca-certificates file + apt-get update && apt-get install -y iptables openssl libssl-dev ca-certificates file idn2 update-ca-certificates elif [[ ${RUBY_VERSION:0:3} = "2.1" ]]; then apt-get update && apt-get install -y 
libsodium-dev iptables @@ -23,7 +23,7 @@ elif [[ ${RUBY_VERSION:0:3} = "2.3" ]]; then wget http://deb.debian.org/debian/pool/main/o/openssl1.0/libssl1.0-dev_1.0.2u-1~deb9u1_amd64.deb dpkg -i libssl1.0-dev_1.0.2u-1~deb9u1_amd64.deb else - apt-get update && apt-get install -y iptables + apt-get update && apt-get install -y iptables idn2 fi # use port 9090 to test connection timeouts From 0c7712ca8833f7aac24a0e747894f8116bc8cee3 Mon Sep 17 00:00:00 2001 From: HoneyryderChuck Date: Fri, 11 Jun 2021 18:51:57 +0100 Subject: [PATCH 4/6] using punycode name as the authority in the headers, which is actually a bugfix --- lib/httpx/extensions.rb | 2 +- test/support/assertion_helpers.rb | 4 ++-- test/support/requests/get.rb | 6 +++++- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/httpx/extensions.rb b/lib/httpx/extensions.rb index fb0dcac0..a55aff7d 100644 --- a/lib/httpx/extensions.rb +++ b/lib/httpx/extensions.rb @@ -78,7 +78,7 @@ module HTTPX def authority port_string = port == default_port ? nil : ":#{port}" - "#{@non_ascii_hostname || host}#{port_string}" + "#{host}#{port_string}" end def origin diff --git a/test/support/assertion_helpers.rb b/test/support/assertion_helpers.rb index 491eede8..d2ff141b 100644 --- a/test/support/assertion_helpers.rb +++ b/test/support/assertion_helpers.rb @@ -17,12 +17,12 @@ module ResponseHelpers if value.respond_to?(:start_with?) 
assert value.start_with?(expect), "#{meth} assertion failed: \#{key}=\#{value} (expected: \#{expect}})" else - assert value == expect, "#{meth} assertion failed: \#{key}=\#{value.to_s} (expected: \#{expect.to_s})" + assert value == expect, "#{meth} assertion failed: \#{key}=\#{value.inspect} (expected: \#{expect.to_s})" end end def verify_no_#{meth}(#{meth}s, key) - assert !#{meth}s.key?(key), "#{meth}s contains the given key (" + key + ": \#{#{meth}s[key]})" + assert !#{meth}s.key?(key), "#{meth}s contains the given key (" + key + ": \#{#{meth}s[key].inspect})" end DEFINE end diff --git a/test/support/requests/get.rb b/test/support/requests/get.rb index 71137399..016a13cc 100644 --- a/test/support/requests/get.rb +++ b/test/support/requests/get.rb @@ -96,8 +96,12 @@ module Requests def test_get_idn response = HTTPX.get("http://bücher.ch") - verify_status(response, 200) + verify_status(response, 301) + verify_header(response.headers, "location", "https://www.buecher.de") + response.close + + assert response.instance_variable_get(:@request).authority == "xn--bcher-kva.ch" end unless RUBY_VERSION < "2.3" def test_get_non_ascii From 8834f79f10b8d95644105d63d64ca5cdf6b9a8f3 Mon Sep 17 00:00:00 2001 From: HoneyryderChuck Date: Fri, 11 Jun 2021 18:57:53 +0100 Subject: [PATCH 5/6] introduction to idnx --- www/_posts/2021-06-11-introducing-idnx.md | 44 +++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 www/_posts/2021-06-11-introducing-idnx.md diff --git a/www/_posts/2021-06-11-introducing-idnx.md b/www/_posts/2021-06-11-introducing-idnx.md new file mode 100644 index 00000000..57d506a0 --- /dev/null +++ b/www/_posts/2021-06-11-introducing-idnx.md @@ -0,0 +1,44 @@ +--- +layout: post +title: Introducing idnx +--- + + +I've just published the first version of [idnx](https://github.com/HoneyryderChuck/idnx) to Rubygems. `idnx` is a ruby gem which converts Internationalized Domain Names into Punycode. 
The gist of it is: + +```ruby +require "idnx" + +Idnx.to_punycode("bücher.de") #=> "xn--bcher-kva.de" +``` + +That's it! That's the announcement! + +## Why yet another idn gem? + +Let me spare you the work: here's the [ruby toolbox link](https://www.ruby-toolbox.com/search?q=idn). Yes, there have been many IDN-related gems over the years. Why yet another one? + +While researching how to better support IDN domain names for `httpx`, I asked myself, "what does cURL do?". After a session of "look at the source", I found out that cURL uses [libidn2](https://github.com/libidn/libidn2) in Unix environments, while it uses [the winAPI IdnToAscii](https://docs.microsoft.com/en-us/windows/win32/api/winnls/nf-winnls-idntoascii) on Windows. + +After that, I searched for a ruby library that would support at least one of the above. To my surprise, I didn't find any. In fact, I found out that most of the idn-related gems from that ruby toolbox list haven't received much attention for years, and most of them use [libidn](https://www.gnu.org/software/libidn/), the predecessor of `libidn2`, which does not support the IDNA 2008 Punycode protocol. Also, none of them supports Windows. + +So I decided to roll my own, the cURL way: provide bindings for `libidn2`, while using Windows APIs for Windows, all via FFI, so that it'd transparently work with JRuby. + +## Why no punycode-to-idn translation? + +The short answer is: because I don't need it. If you do though, I'll welcome a Pull Request introducing it. + +## Why doesn't ruby provide this? + +I've previously [discussed on the ruby bugs board](https://bugs.ruby-lang.org/issues/17309) the lack of support for punycode, and how that breaks the "principle of least astonishment" when using standard library APIs like `uri` or `resolv`. 
I understand that doing so would require `ruby` to be dependent on `libidn2` (at least in Linux/BSD systems), and the core team has been pretty resistant when it comes to adding more dependencies to the runtime. I understand that this'll never happen, unless someone makes a convincing argument that satisfies the core team. + +Until then, you can use this gem, which, should that day come, can hopefully work as a template. + +## Will I need idnx to use httpx? + +No. `idnx` will be a "weak" dependency, i.e. you'll have to install it yourself, and `httpx` will hook into it if available. It'll otherwise fall back to a [pure ruby punycode implementation imported from another ruby gem](https://gitlab.com/honeyryderchuck/httpx/-/blob/master/lib/httpx/punycode.rb) (it doesn't support IDNA2008, however, so make sure to use `idnx` if you require it). + + +---- + +That's it. Happy hacking! \ No newline at end of file From 44724a1edb7633a418b3f89eff0953110e63b733 Mon Sep 17 00:00:00 2001 From: HoneyryderChuck Date: Fri, 11 Jun 2021 18:58:04 +0100 Subject: [PATCH 6/6] fix docs --- www/_data/plugins.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/www/_data/plugins.yml b/www/_data/plugins.yml index 8501b5ae..46962686 100644 --- a/www/_data/plugins.yml +++ b/www/_data/plugins.yml @@ -12,7 +12,7 @@ description: API and support for NTLM Authentication. - name: AwsSigV4Authentication - path: AWS-SigV4.html + path: AWS-SigV4 description: API and support for AWS SigV4 Authentication. - name: Compression