From: Tim Serong Date: Fri, 1 May 2015 15:59:53 +0000 (+1000) Subject: json_spirit: use utf8 internally when parsing \uHHHH X-Git-Tag: v0.80.11~80^2~1 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=84b00f189554fc37413c990ac4011079bb5cdb60;p=ceph.git json_spirit: use utf8 internally when parsing \uHHHH When the python CLI is given non-ASCII characters, it converts them to \uHHHH escapes in JSON. json_spirit parses these internally into 16-bit characters, which could only work if json_spirit were built to use std::wstring, which it isn't; it's using std::string, so the high byte ends up being zeroed, leaving the low byte which is effectively garbage. This hack^H^H^H^H change makes json_spirit convert to utf8 internally instead, which can be stored just fine inside a std::string. Note that this implementation still assumes \uHHHH escapes are four hex digits, so it'll only cope with characters in the Basic Multilingual Plane. Still, that's rather a lot more characters than it could cope with before ;) (For characters outside the BMP, Python seems to generate escapes in the form \uHHHHHHHH, i.e. 8 hex digits, which the current implementation doesn't expect to see) Fixes: #7387 Signed-off-by: Tim Serong (cherry picked from commit 8add15b86e7aaef41397ab8fa9e77ee7957eb607) Conflicts: src/test/mon/osd-pool-create.sh Changed $CEPH_MON to 127.0.0.1 -- the CEPH_MON was introduced after firefly to allow tests to run in parallel. Back in firefly all tests used the same port because 127.0.0.1 was hardcoded. We can't conveniently backport all that's necessary for tests to run in parallel, therefore we keep the 127.0.0.1 hardcoded. 
--- diff --git a/src/json_spirit/json_spirit_reader_template.h b/src/json_spirit/json_spirit_reader_template.h index f87b59331b7..2eaf743efae 100644 --- a/src/json_spirit/json_spirit_reader_template.h +++ b/src/json_spirit/json_spirit_reader_template.h @@ -13,6 +13,8 @@ #include "json_spirit_value.h" #include "json_spirit_error_position.h" +#include "common/utf8.h" + #define BOOST_SPIRIT_THREADSAFE // uncomment for multithreaded use, requires linking to boost.thread #include @@ -71,18 +73,30 @@ namespace json_spirit return ( hex_to_num( c1 ) << 4 ) + hex_to_num( c2 ); } - template< class Char_type, class Iter_type > - Char_type unicode_str_to_char( Iter_type& begin ) + template< class String_type, class Iter_type > + String_type unicode_str_to_utf8( Iter_type& begin ); + + template<> + std::string unicode_str_to_utf8( std::string::const_iterator & begin ) { + typedef typename std::string::value_type Char_type; + const Char_type c1( *( ++begin ) ); const Char_type c2( *( ++begin ) ); const Char_type c3( *( ++begin ) ); const Char_type c4( *( ++begin ) ); - return ( hex_to_num( c1 ) << 12 ) + - ( hex_to_num( c2 ) << 8 ) + - ( hex_to_num( c3 ) << 4 ) + - hex_to_num( c4 ); + unsigned long uc = ( hex_to_num( c1 ) << 12 ) + + ( hex_to_num( c2 ) << 8 ) + + ( hex_to_num( c3 ) << 4 ) + + hex_to_num( c4 ); + + unsigned char buf[7]; // MAX_UTF8_SZ is 6 (see src/common/utf8.c) + int r = encode_utf8(uc, buf); + if (r >= 0) { + return std::string(reinterpret_cast(buf), r); + } + return std::string("_"); } template< class String_type > @@ -116,7 +130,7 @@ namespace json_spirit { if( end - begin >= 5 ) // expecting "uHHHH..." 
{ - s += unicode_str_to_char< Char_type >( begin ); + s += unicode_str_to_utf8< String_type >( begin ); } break; } @@ -178,11 +192,15 @@ namespace json_spirit return get_str_< std::string >( begin, end ); } +// Need this guard else it tries to instantiate unicode_str_to_utf8 with a +// std::wstring, which isn't presently implemented +#if defined( JSON_SPIRIT_WMVALUE_ENABLED ) && !defined( BOOST_NO_STD_WSTRING ) inline std::wstring get_str( std::wstring::const_iterator begin, std::wstring::const_iterator end ) { return get_str_< std::wstring >( begin, end ); } - +#endif + template< class String_type, class Iter_type > String_type get_str( Iter_type begin, Iter_type end ) { diff --git a/src/test/mon/osd-pool-create.sh b/src/test/mon/osd-pool-create.sh index 0e4811b41d7..ef18eb57bd2 100755 --- a/src/test/mon/osd-pool-create.sh +++ b/src/test/mon/osd-pool-create.sh @@ -228,6 +228,21 @@ function TEST_replicated_pool() { grep 'cannot change to type erasure' || return 1 } +function TEST_utf8_cli() { + local dir=$1 + run_mon $dir a --public-addr 127.0.0.1 + # Hopefully it's safe to include literal UTF-8 characters to test + # the fix for http://tracker.ceph.com/issues/7387. If it turns out + # to not be OK (when is the default encoding *not* UTF-8?), maybe + # the character '黄' can be replaced with the escape $'\xe9\xbb\x84' + ./ceph osd pool create 黄 1024 2>&1 | \ + grep "pool '黄' created" || return 1 + ./ceph osd lspools 2>&1 | \ + grep "黄" || return 1 + ./ceph -f json-pretty osd dump | \ + python -c "import json; import sys; json.load(sys.stdin)" || return 1 + ./ceph osd pool delete 黄 黄 --yes-i-really-really-mean-it +} main osd-pool-create