From 5f14fd96fed2a7d1072c28ede66988e7ba3b67ea Mon Sep 17 00:00:00 2001
From: Colin Patrick McCabe
Date: Tue, 22 Mar 2011 07:20:58 -0700
Subject: [PATCH] common: Add utf8 validation functions, test

Add utility functions for validating a buffer as valid UTF-8.
Add a unit test to check the functions.

Signed-off-by: Colin McCabe
---
 src/Makefile.am   |   8 +++
 src/common/utf8.c | 166 ++++++++++++++++++++++++++++++++++++++++++++++
 src/common/utf8.h |  38 +++++++++++
 src/test/utf8.cc  |  49 ++++++++++++++
 4 files changed, 261 insertions(+)
 create mode 100644 src/common/utf8.c
 create mode 100644 src/common/utf8.h
 create mode 100644 src/test/utf8.cc

diff --git a/src/Makefile.am b/src/Makefile.am
index 52367644a1099..7e216ef3f3182 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -420,6 +420,12 @@ unittest_ceph_crypto_LDADD = libcommon.a ${CRYPTO_LIBS} \
 unittest_ceph_crypto_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
 check_PROGRAMS += unittest_ceph_crypto
 
+unittest_utf8_SOURCES = test/utf8.cc
+unittest_utf8_LDFLAGS = -pthread ${AM_LDFLAGS}
+unittest_utf8_LDADD = libcommon.a ${UNITTEST_LDADD}
+unittest_utf8_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
+check_PROGRAMS += unittest_utf8
+
 # shell scripts
 editpaths = sed \
 	-e 's|@bindir[@]|$(bindir)|g' \
@@ -585,6 +591,7 @@ libcommon_files = \
 	include/ceph_strings.cc \
 	include/ceph_frag.cc \
 	common/config.cc \
+	common/utf8.c \
 	common/page.cc \
 	common/lockdep.cc \
 	common/DoutStreambuf.cc \
@@ -751,6 +758,7 @@ noinst_HEADERS = \
 	common/safe_io.h\
 	common/config.h\
 	common/ceph_crypto.h\
+	common/utf8.h\
 	crush/CrushWrapper.h\
 	crush/CrushWrapper.i\
 	crush/builder.h\
diff --git a/src/common/utf8.c b/src/common/utf8.c
new file mode 100644
index 0000000000000..11957b3143564
--- /dev/null
+++ b/src/common/utf8.c
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream
Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_UTF8_SZ 6
+#define INVALID_UTF8_CHAR 0xfffffffful
+
+/* Count the run of consecutive 1 bits in the low byte of 'c', starting at
+ * bit 7 and walking downwards.
+ *
+ * For a UTF-8 lead byte this is the length of the sequence it announces
+ * (0xC3 -> 2, 0xE2 -> 3).  A 7-bit ASCII byte gives 0, a continuation byte
+ * (10xxxxxx) gives 1, and 0xFF gives 8.
+ *
+ * A moving mask is used instead of shifting 'c' itself, so a signed value
+ * is never left-shifted.
+ */
+static int high_bits_set(int c)
+{
+  int ret = 0;
+  unsigned int mask;
+
+  for (mask = 0x80u; mask != 0 && ((unsigned int)c & mask) != 0; mask >>= 1)
+    ++ret;
+  return ret;
+}
+
+/* Encode a 31-bit UTF8 code point to 'buf'.
+ * Assumes buf is of size MAX_UTF8_SZ.
+ * Returns -1 if 'u' does not fit in 31 bits; otherwise the number of bytes
+ * written to 'buf' (1..MAX_UTF8_SZ).  The shortest possible encoding is
+ * always chosen.
+ */
+static int encode_utf8(unsigned long u, unsigned char *buf)
+{
+  /* max_val[n] is the largest code point representable in n + 1 bytes. */
+  static const unsigned long max_val[MAX_UTF8_SZ] = {
+    0x0000007ful, 0x000007fful, 0x0000fffful,
+    0x001ffffful, 0x03fffffful, 0x7ffffffful
+  };
+  unsigned char lead;
+  int i;
+  int j;
+
+  /* Pick the first (shortest) length whose range contains u. */
+  for (i = 0; i < MAX_UTF8_SZ; ++i) {
+    if (u <= max_val[i])
+      break;
+  }
+  if (i == MAX_UTF8_SZ) {
+    // This code point is too big to encode.
+    return -1;
+  }
+
+  if (i == 0) {
+    /* Plain 7-bit value: stored as is. */
+    buf[0] = (unsigned char)u;
+    return 1;
+  }
+
+  /* Fill the i continuation bytes from last to first, six payload bits
+   * each, consuming u from its low end. */
+  for (j = i; j > 0; --j) {
+    buf[j] = (unsigned char)(0x80 | (u & 0x3f));
+    u >>= 6;
+  }
+
+  /* Lead byte: i + 1 high bits set, then whatever is left of u. */
+  lead = (unsigned char)~(0xFF >> (i + 1));
+  buf[0] = (unsigned char)(lead | u);
+
+  return i + 1;
+}
+
+/*
+ * Decode a UTF8 character from an array of bytes. Return character code.
+ * Upon error, return INVALID_UTF8_CHAR.
+ */
+static unsigned long decode_utf8(const unsigned char *buf, int nbytes)
+{
+  unsigned long code;
+  int lead_bits;
+  int j;
+
+  if (nbytes <= 0)
+    return INVALID_UTF8_CHAR;
+
+  /* A one-byte sequence is only valid as plain 7-bit ASCII. */
+  if (nbytes == 1)
+    return (buf[0] < 0x80) ? buf[0] : INVALID_UTF8_CHAR;
+
+  /* The lead byte must announce exactly as many bytes as we were given. */
+  lead_bits = high_bits_set(buf[0]);
+  if (lead_bits != nbytes)
+    return INVALID_UTF8_CHAR;
+
+  /* Payload bits of the lead byte, then six bits per continuation byte. */
+  code = buf[0] & (0xff >> lead_bits);
+  for (j = 1; j < nbytes; ++j) {
+    if ((buf[j] & 0xc0) != 0x80)
+      return INVALID_UTF8_CHAR;
+    code = (code << 6) | (buf[j] & 0x3f);
+  }
+
+  // Check for invalid code points
+  if (code == 0xFFFE || code == 0xFFFF)
+    return INVALID_UTF8_CHAR;
+  if (code >= 0xD800 && code <= 0xDFFF)
+    return INVALID_UTF8_CHAR;
+
+  return code;
+}
+
+/* Returns 1 if the 'seq_len' bytes at 'seq' are exactly one acceptable
+ * UTF-8 sequence, 0 otherwise.
+ *
+ * The bytes are decoded and the resulting code point is encoded again; the
+ * sequence is accepted only if that round trip reproduces the same bytes.
+ * encode_utf8() always emits the shortest form, so a longer-than-necessary
+ * ("overlong") input fails the length comparison.
+ */
+static int sequence_is_valid(const unsigned char *seq, int seq_len)
+{
+  unsigned char re_encoded[MAX_UTF8_SZ];
+  unsigned long code = decode_utf8(seq, seq_len);
+
+  if (code == INVALID_UTF8_CHAR)
+    return 0;
+  if (encode_utf8(code, re_encoded) != seq_len)
+    return 0;
+  return memcmp(seq, re_encoded, seq_len) == 0;
+}
+
+/* Validate the 'len' bytes at 'buf' as UTF-8.
+ *
+ * Bytes >= 0x80 are collected into a pending sequence; the sequence is
+ * checked when the next byte that is not a continuation byte (or the end of
+ * the buffer) is reached.
+ *
+ * Returns 0 if the buffer is valid.  Otherwise returns i + 1, where i is
+ * the index at which the problem was detected: the byte following the
+ * offending sequence ('len' if the buffer ends with it), or the
+ * continuation byte that would make a sequence longer than MAX_UTF8_SZ.
+ */
+int check_utf8(const char *buf, int len)
+{
+  unsigned char u[MAX_UTF8_SZ];
+  int enc_len = 0;
+  int i = 0;
+
+  while (1) {
+    const int at_end = (i >= len);
+    /* Only read buf[i] while i is inside the buffer.  Going through
+     * unsigned char keeps the value in 0..255 whether or not plain char
+     * is signed. */
+    const unsigned int c = at_end ? 0u : (unsigned char)buf[i];
+
+    if (at_end || c < 0x80 || (c & 0xC0) != 0x80) {
+      // the start of a new character. Process what we have
+      // in the buffer.
+      if (enc_len > 0 && !sequence_is_valid(u, enc_len))
+        return i + 1;
+      enc_len = 0;
+      if (at_end)
+        break;
+      /* A lead byte opens a new pending sequence; an ASCII byte needs
+       * no further checking. */
+      if (c >= 0x80)
+        u[enc_len++] = (unsigned char)c;
+    } else {
+      /* Continuation byte: append it unless the sequence is already at
+       * the maximum length. */
+      if (enc_len == MAX_UTF8_SZ)
+        return i + 1;
+      u[enc_len++] = (unsigned char)c;
+    }
+    ++i;
+  }
+  return 0;
+}
+
+/* Validate a NUL-terminated string; see check_utf8() for the return value. */
+int check_utf8_cstr(const char *buf)
+{
+  return check_utf8(buf, (int)strlen(buf));
+}
diff --git a/src/common/utf8.h b/src/common/utf8.h
new file mode 100644
index 0000000000000..e1c891cef5926
--- /dev/null
+++ b/src/common/utf8.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_UTF8_H
+#define CEPH_COMMON_UTF8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Checks if a buffer is valid UTF-8.
+ * Returns 0 if it is, and one plus the offset of the first invalid byte
+ * if it is not.
+ */
+int check_utf8(const char *buf, int len);
+
+/* Checks if a null-terminated string is valid UTF-8.
+ * Returns 0 if it is, and one plus the offset of the first invalid byte
+ * if it is not.
+ */
+int check_utf8_cstr(const char *buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/test/utf8.cc b/src/test/utf8.cc
new file mode 100644
index 0000000000000..7c1a4de24eb64
--- /dev/null
+++ b/src/test/utf8.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+#include "gtest/gtest.h"
+#include <stdint.h>
+
+TEST(IsValidUtf8, SimpleAscii) {
+  ASSERT_EQ(0, check_utf8_cstr("Ascii ftw."));
+  ASSERT_EQ(0, check_utf8_cstr(""));  // empty string is valid
+  ASSERT_EQ(0, check_utf8_cstr("B"));
+  ASSERT_EQ(0, check_utf8_cstr("Badgers badgers badgers badgers "
+                               "mushroom mushroom"));
+  ASSERT_EQ(0, check_utf8("foo", strlen("foo")));  // explicit-length entry point
+}
+
+TEST(IsValidUtf8, ControlChars) {
+  // Sadly, control characters are valid utf8...
+  uint8_t control_chars[] = { 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+                              0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d };
+  ASSERT_EQ(0, check_utf8((char*)control_chars, sizeof(control_chars)));
+}
+
+TEST(IsValidUtf8, SimpleUtf8) {
+  uint8_t funkystr[] = { 0x66, 0xd1, 0x86, 0xd1, 0x9d, 0xd2, 0xa0, 0xd3,  // ASCII 'f', then 2-byte sequences,
+                         0xad, 0xd3, 0xae, 0x0a };                        // ending in a newline; not NUL-terminated
+  ASSERT_EQ(0, check_utf8((char*)funkystr, sizeof(funkystr)));
+
+  uint8_t valid2[] = { 0xc3, 0xb1 };  // one 2-byte sequence (code point 0xF1)
+  ASSERT_EQ(0, check_utf8((char*)valid2, sizeof(valid2)));
+}
+
+TEST(IsValidUtf8, InvalidUtf8) {
+  uint8_t inval[] = { 0xe2, 0x28, 0xa1 };  // 3-byte lead followed by an ASCII byte
+  ASSERT_NE(0, check_utf8((char*)inval, sizeof(inval)));
+
+  uint8_t invalid2[] = { 0xc3, 0x28 };  // 2-byte lead followed by an ASCII byte
+  ASSERT_NE(0, check_utf8((char*)invalid2, sizeof(invalid2)));
+}
-- 
2.39.5