common: Add utf8 validation functions, test

author Colin Patrick McCabe <cmccabe@alumni.cmu.edu>

Tue, 22 Mar 2011 14:20:58 +0000 (07:20 -0700)

committer Colin Patrick McCabe <cmccabe@alumni.cmu.edu>

Tue, 22 Mar 2011 14:30:47 +0000 (07:30 -0700)
author Colin Patrick McCabe <cmccabe@alumni.cmu.edu>
Tue, 22 Mar 2011 14:20:58 +0000 (07:20 -0700)
committer Colin Patrick McCabe <cmccabe@alumni.cmu.edu>
Tue, 22 Mar 2011 14:30:47 +0000 (07:30 -0700)
diff --git a/src/Makefile.am b/src/Makefile.am

index 52367644a10993ef1939053cb60c2e6491db6c3e..7e216ef3f31825e55dc7add6545548727bf60ffa 100644 (file)
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -420,6 +420,12 @@ unittest_ceph_crypto_LDADD = libcommon.a ${CRYPTO_LIBS} \
  unittest_ceph_crypto_CXXFLAGS = ${CRYPTO_CXXFLAGS} ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
  check_PROGRAMS += unittest_ceph_crypto
  
+# unittest_utf8: built from test/utf8.cc, linked against libcommon.a, and
+# added to check_PROGRAMS.
+unittest_utf8_SOURCES = test/utf8.cc
+unittest_utf8_LDFLAGS = -pthread ${AM_LDFLAGS}
+unittest_utf8_LDADD = libcommon.a ${UNITTEST_LDADD}
+unittest_utf8_CXXFLAGS = ${AM_CXXFLAGS} ${UNITTEST_CXXFLAGS}
+check_PROGRAMS += unittest_utf8
+
  # shell scripts
  editpaths = sed \
         -e 's|@bindir[@]|$(bindir)|g' \
@@ -585,6 +591,7 @@ libcommon_files = \
         include/ceph_strings.cc \
         include/ceph_frag.cc \
         common/config.cc \
+       common/utf8.c \
         common/page.cc \
         common/lockdep.cc \
         common/DoutStreambuf.cc \
@@ -751,6 +758,7 @@ noinst_HEADERS = \
         common/safe_io.h\
          common/config.h\
         common/ceph_crypto.h\
+       common/utf8.h\
          crush/CrushWrapper.h\
          crush/CrushWrapper.i\
          crush/builder.h\
diff --git a/src/common/utf8.c b/src/common/utf8.c

new file mode 100644 (file)

index 0000000..11957b3
--- /dev/null
+++ b/src/common/utf8.c
@@ -0,0 +1,166 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+
+#include <stdio.h>
+#include <string.h>
+
+#define MAX_UTF8_SZ 6
+#define INVALID_UTF8_CHAR 0xfffffffful
+
+/*
+ * Count the run of 1 bits that starts at bit 7 (0x80) of 'c' and continues
+ * downward. decode_utf8 compares this count, taken from a lead byte, with
+ * the number of bytes in the sequence.
+ */
+static int high_bits_set(int c)
+{
+       int count;
+
+       // Shift the next lower bit into the 0x80 position until it is clear.
+       for (count = 0; (c & 0x80) == 0x80; ++count)
+               c <<= 1;
+       return count;
+}
+
+/* Encode a 31-bit UTF8 code point to 'buf'.
+ * Assumes buf is of size MAX_UTF8_SZ
+ * Returns -1 on failure; number of bytes in the encoded value otherwise.
+ */
+static int encode_utf8(unsigned long u, unsigned char *buf)
+{
+       /* max_val[n] is the largest code point that fits in n + 1 bytes.
+        * The table never changes, so keep a single read-only copy instead
+        * of rebuilding it on the stack for every call. */
+       static const unsigned long max_val[MAX_UTF8_SZ] = {
+               0x0000007ful, 0x000007fful, 0x0000fffful,
+               0x001ffffful, 0x03fffffful, 0x7ffffffful
+       };
+       static const int MAX_VAL_SZ = sizeof(max_val) / sizeof(max_val[0]);
+       unsigned char mask;
+       int i, j;
+
+       // Find the shortest encoding that can hold 'u'.
+       for (i = 0; i < MAX_VAL_SZ; ++i) {
+               if (u <= max_val[i])
+                       break;
+       }
+       if (i == MAX_VAL_SZ) {
+               // This code point is too big to encode.
+               return -1;
+       }
+
+       if (i == 0) {
+               // One byte is enough: the value is stored as-is.
+               buf[0] = u;
+               return 1;
+       }
+
+       // Fill the i trailing bytes from last to first. Each one carries six
+       // payload bits under a 10xxxxxx marker.
+       for (j = i; j > 0; --j) {
+               buf[j] = 0x80 | (u & 0x3f);
+               u >>= 6;
+       }
+
+       // The lead byte has its top (i + 1) bits set; the payload bits that
+       // are still left in 'u' go below them.
+       mask = ~(0xFF >> (i + 1));
+       buf[0] = mask | u;
+
+       return i + 1;
+}
+
+/*
+ * Decode a UTF8 character from an array of bytes. Return character code.
+ * Upon error, return INVALID_UTF8_CHAR.
+ *
+ * 'buf' holds the 'nbytes' bytes of one candidate sequence; it is only read.
+ */
+static unsigned long decode_utf8(const unsigned char *buf, int nbytes)
+{
+       unsigned long code;
+       int i, j;
+
+       if (nbytes <= 0)
+               return INVALID_UTF8_CHAR;
+
+       if (nbytes == 1) {
+               // A one-byte sequence must have its top bit clear.
+               if (buf[0] >= 0x80)
+                       return INVALID_UTF8_CHAR;
+               return buf[0];
+       }
+
+       // The lead byte must announce exactly as many bytes as were supplied.
+       i = high_bits_set(buf[0]);
+       if (i != nbytes)
+               return INVALID_UTF8_CHAR;
+       // Keep only the payload bits that follow the lead byte's length prefix.
+       code = buf[0] & (0xff >> i);
+       for (j = 1; j < nbytes; ++j) {
+               // Every trailing byte must have the form 10xxxxxx.
+               if ((buf[j] & 0xc0) != 0x80)
+                       return INVALID_UTF8_CHAR;
+               code = (code << 6) | (buf[j] & 0x3f);
+       }
+
+       // Check for invalid code points: 0xFFFE, 0xFFFF, and the
+       // 0xD800..0xDFFF range (UTF-16 surrogates).
+       if (code == 0xFFFE || code == 0xFFFF ||
+           (code >= 0xD800 && code <= 0xDFFF))
+               return INVALID_UTF8_CHAR;
+
+       return code;
+}
+
+/*
+ * Validate the 'len' bytes starting at 'buf' as UTF-8.
+ *
+ * Bytes are collected one sequence at a time. Each completed sequence is
+ * decoded and then re-encoded, and the result must match the input bytes
+ * exactly; this rejects encodings that are longer than necessary.
+ *
+ * Returns 0 if the whole buffer is valid. Otherwise returns i + 1, where i
+ * is the index at which the bad sequence was detected (the byte that follows
+ * the sequence, 'len' if the buffer ended, or the excess continuation byte).
+ */
+int check_utf8(const char *buf, int len)
+{
+       unsigned char u[MAX_UTF8_SZ];
+       int enc_len = 0;
+       int i = 0;
+       while (1) {
+               // Never read buf[len]: the buffer need not be NUL-terminated.
+               // The placeholder 0 is not used once i >= len. Going through
+               // unsigned char avoids sign-extending bytes >= 0x80.
+               unsigned int c = (i < len) ? (unsigned char)buf[i] : 0;
+               if (i >= len || c < 0x80 || (c & 0xC0) != 0x80) {
+                       // the start of a new character. Process what we have
+                       // in the buffer.
+                       if (enc_len > 0) {
+                               int re_encoded_len;
+                               unsigned char re_encoded[MAX_UTF8_SZ];
+                               unsigned long code = decode_utf8(u, enc_len);
+                               if (code == INVALID_UTF8_CHAR) {
+                                       // The collected bytes do not decode.
+                                       return i + 1;
+                               }
+                               re_encoded_len = encode_utf8(code, re_encoded);
+                               if (enc_len != re_encoded_len) {
+                                       // Re-encoding gives a different
+                                       // length than the input used.
+                                       return i + 1;
+                               }
+                               if (memcmp(u, re_encoded, enc_len) != 0) {
+                                       // Same length, different bytes.
+                                       return i + 1;
+                               }
+                       }
+                       enc_len = 0;
+                       if (i >= len)
+                               break;
+                       // A byte >= 0x80 here is a lead byte: start
+                       // collecting a new sequence with it.
+                       if (c >= 0x80)
+                               u[enc_len++] = c;
+               } else {
+                       // Continuation byte (10xxxxxx).
+                       if (enc_len == MAX_UTF8_SZ) {
+                               // More bytes than any sequence may have.
+                               return i + 1;
+                       }
+                       u[enc_len++] = c;
+               }
+               ++i;
+       }
+       return 0;
+}
+
+/*
+ * Validate a NUL-terminated string: its length is taken with strlen and the
+ * result of check_utf8 on those bytes is returned unchanged.
+ */
+int check_utf8_cstr(const char *buf)
+{
+       int nbytes = strlen(buf);
+
+       return check_utf8(buf, nbytes);
+}
diff --git a/src/common/utf8.h b/src/common/utf8.h

new file mode 100644 (file)

index 0000000..e1c891c
--- /dev/null
+++ b/src/common/utf8.h
@@ -0,0 +1,38 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_COMMON_UTF8_H
+#define CEPH_COMMON_UTF8_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Checks if a buffer is valid UTF-8.
+ *
+ * buf: the bytes to check.
+ * len: the number of bytes to check.
+ *
+ * Returns 0 if it is. If it is not, returns one plus the index at which the
+ * malformed sequence was detected: the byte following that sequence, len if
+ * the buffer ended there, or a continuation byte beyond the maximum length.
+ */
+int check_utf8(const char *buf, int len);
+
+/* Checks if a null-terminated string is valid UTF-8.
+ *
+ * The length is taken with strlen, and the return value is that of
+ * check_utf8 on those bytes: 0 if valid, a positive value otherwise.
+ */
+int check_utf8_cstr(const char *buf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/test/utf8.cc b/src/test/utf8.cc

new file mode 100644 (file)

index 0000000..7c1a4de
--- /dev/null
+++ b/src/test/utf8.cc
@@ -0,0 +1,49 @@
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2011 New Dream Network
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+#include "common/utf8.h"
+#include "gtest/gtest.h"
+#include <stdint.h>
+
+// Plain ASCII strings, including the empty string, must be accepted.
+TEST(IsValidUtf8, SimpleAscii) {
+  ASSERT_EQ(0, check_utf8_cstr("Ascii ftw."));
+  ASSERT_EQ(0, check_utf8_cstr(""));
+  ASSERT_EQ(0, check_utf8_cstr("B"));
+  ASSERT_EQ(0, check_utf8_cstr("Badgers badgers badgers badgers "
+                              "mushroom mushroom"));
+  // Explicit-length entry point. sizeof minus the terminating NUL gives the
+  // length without strlen, whose header this file does not include.
+  const char foo[] = "foo";
+  ASSERT_EQ(0, check_utf8(foo, sizeof(foo) - 1));
+}
+
+// Bytes 0x01 through 0x0d must be accepted.
+TEST(IsValidUtf8, ControlChars) {
+  // Sadly, control characters are valid utf8...
+  const uint8_t ctrl[] = {
+    0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
+    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d
+  };
+  const char *bytes = reinterpret_cast<const char *>(ctrl);
+  ASSERT_EQ(0, check_utf8(bytes, sizeof(ctrl)));
+}
+
+// Well-formed two-byte sequences mixed with ASCII must be accepted.
+TEST(IsValidUtf8, SimpleUtf8) {
+  // 'f', five two-byte sequences, then a newline.
+  const uint8_t mixed[] = {
+    0x66, 0xd1, 0x86, 0xd1, 0x9d, 0xd2, 0xa0, 0xd3,
+    0xad, 0xd3, 0xae, 0x0a
+  };
+  const char *mixed_bytes = reinterpret_cast<const char *>(mixed);
+  ASSERT_EQ(0, check_utf8(mixed_bytes, sizeof(mixed)));
+
+  // A single two-byte sequence that ends exactly at the end of the buffer.
+  const uint8_t two_byte[] = { 0xc3, 0xb1 };
+  const char *two_byte_bytes = reinterpret_cast<const char *>(two_byte);
+  ASSERT_EQ(0, check_utf8(two_byte_bytes, sizeof(two_byte)));
+}
+
+// Lead bytes that are not followed by enough continuation bytes must be
+// rejected.
+TEST(IsValidUtf8, InvalidUtf8) {
+  // 0xe2 announces three bytes but is followed directly by ASCII 0x28.
+  const uint8_t broken3[] = { 0xe2, 0x28, 0xa1 };
+  const char *broken3_bytes = reinterpret_cast<const char *>(broken3);
+  ASSERT_NE(0, check_utf8(broken3_bytes, sizeof(broken3)));
+
+  // 0xc3 announces two bytes but is followed by ASCII 0x28.
+  const uint8_t broken2[] = { 0xc3, 0x28 };
+  const char *broken2_bytes = reinterpret_cast<const char *>(broken2);
+  ASSERT_NE(0, check_utf8(broken2_bytes, sizeof(broken2)));
+}
author	Colin Patrick McCabe <cmccabe@alumni.cmu.edu>
	Tue, 22 Mar 2011 14:20:58 +0000 (07:20 -0700)
committer	Colin Patrick McCabe <cmccabe@alumni.cmu.edu>
	Tue, 22 Mar 2011 14:30:47 +0000 (07:30 -0700)
src/Makefile.am		patch \| blob \| history
src/common/utf8.c	[new file with mode: 0644]	patch \| blob
src/common/utf8.h	[new file with mode: 0644]	patch \| blob
src/test/utf8.cc	[new file with mode: 0644]	patch \| blob