utf8-parse-test.cpp

Go to the documentation of this file.
00001 /*
00002  * utf8-parse-test.cpp
00003  *
00004  * Copyright (C) 2010  Thomas A. Vaughan
00005  * All rights reserved.
00006  *
00007  * Quick test of parsing routines, with UTF-8 examples.
00008  */
00009 
00010 // includes --------------------------------------------------------------------
00011 #include <iostream>
00012 #include <sstream>
00013 
00014 #include "perf/perf.h"
00015 #include "util/parsing.h"
00016 
00017 
// One well-formed test case: a UTF-8 encoded line together with the counts
// the parsing routines are expected to report for it.
//
// Member order matters: the test table below is filled in with positional
// aggregate initializers.
struct test_line_t {
        // the UTF-8 encoded text to parse
        const char *    line;

        // expected number of tokens in line
        int             nTokens;

        // expected number of characters in line
        int             nChars;
};
00023 
00024 
00025 // much test data from http://www.columbia.edu/kermit/utf8.html
00026 
00027 static test_line_t s_testLines[] = {
00028           // utf-8 string                               # of tokens     chars
00029         { "This is a test in English",                          6,      25 },
00030         { "I can eat glass.",                                   4,      16 },
00031 
00032         // Erzian
00033         { "Мон ярсан суликадо, ды зыян эйстэнзэ а ули.",      8,      43 },
00034 
00035         // Hindi
00036         { "मैं काँच खा सकता हूँ और मुझे उससे कोई चोट नहीं पहुंचती",
00037         12, 54 },
00038 
00039         // Chinese (Traditional)
00040         { "我能吞下玻璃而不傷身體",                          1,      11 },
00041 
00042         // Russian
00043         { "Я могу есть стекло, оно мне не вредит.",                8,      38 },
00044 
00045         // French
00046         { "Je peux manger du verre, ça ne me fait pas  mal.",  11,     48 },
00047 
00048         // keep this last!
00049         { NULL, 0, 0 }
00050 };
00051 
00052 
00053 
// Invalid lines are very important!  Not handling these safely is a
//      security risk.
//
// Each entry is one malformed UTF-8 byte sequence stored in a fixed-size,
// NUL-terminated buffer.
struct invalid_line_t {
        const char      line[16];       // malformed bytes, NUL-terminated
};


// Table of malformed UTF-8 inputs.  The table is walked until an entry
// whose first byte is NUL is found, so the empty entry must stay last.
//
// The bytes are written as string literals with hex escapes rather than as
// integer initializers: values of 0x80 and above do not fit in a signed
// char, so brace-initializing a char element with them is a narrowing
// conversion (ill-formed since C++11).  A hex escape in a narrow string
// literal may legally name any byte value, and the remainder of each
// 16-byte buffer is still zero-filled.
static const invalid_line_t s_invalidLines[] = {
        // continuation bytes
        { "\x80" },
        { "\x81" },

        // lonely leading characters
        { "\xc0\x20" },                 // two-byte sequences
        { "\xc1\x30" },
        { "\xc1" },
        { "\xe0\x20" },                 // 3-byte sequences
        { "\xe1" },
        { "\xe1\x20" },
        { "\xf0\x20" },                 // 4-byte sequences
        { "\xf0" },

        // impossible bytes
        { "\xfe" },
        { "\xff" },
        { "\xfe\xfe\xff\xff" },

        // keep this last!
        { "" }
};
00084 
00085 
00086 ////////////////////////////////////////////////////////////////////////////////
00087 //
00088 //      static helper methods
00089 //
00090 ////////////////////////////////////////////////////////////////////////////////
00091 
00092 static void
00093 testLine
00094 (
00095 IN const test_line_t * p
00096 )
00097 {
00098         ASSERT(p, "null");
00099         ASSERT(p->line, "null");
00100 
00101         int count = 0;
00102         std::string token;
00103         const char * txt = p->line;
00104         DPRINTF("Parsing string: %s", txt);
00105         while (*txt) {
00106                 txt = getNextTokenFromString(txt, token, eParse_None);
00107                 ++count;
00108                 DPRINTF("  Token: '%s'", token.c_str());
00109         }
00110 
00111         // did we count the right number of tokens?
00112         ASSERT_THROW(p->nTokens == count,
00113             "Failed to correctly parse tokens in string: '" << p->line <<
00114             "', read " << count << " tokens instead of " << p->nTokens);
00115 
00116         // now parse characters
00117         DPRINTF("  Parsing characters from string...");
00118         utf8_char_t c;
00119         count = 0;      // reset count
00120         txt = p->line;
00121         while (*txt) {
00122                 txt = getUTF8CharacterFromString(txt, c);
00123                 ++count;
00124                 c.dump("    Character");
00125         }
00126 
00127         // did we count the right number of characters?
00128         ASSERT_THROW(p->nChars == count,
00129             "Failed to correctly parse characters in string: '" << p->line <<
00130             "', read " << count << " characters instead of " << p->nChars);
00131 
00132         // now create a stringstream from the line
00133         DPRINTF("  Parsing characters from stream...");
00134         std::istringstream iss(p->line);
00135         count = 0;
00136         while (getUTF8CharacterFromStream(iss, c)) {
00137                 ++count;
00138                 c.dump("    Character");
00139         }
00140 
00141         // did we count the right number of characters?
00142         ASSERT_THROW(p->nChars == count,
00143             "Failed to correctly parse characters in stream: '" << p->line <<
00144             "', read " << count << " characters instead of " << p->nChars);
00145 }
00146 
00147 
00148 
00149 static void
00150 invalidLine
00151 (
00152 IN const invalid_line_t * p
00153 )
00154 {
00155         ASSERT(p, "null");
00156 
00157         int replaceChar = 0xFFFD;       // Unicode replacement character
00158 
00159         // parse this line
00160         utf8_char_t c;
00161         const char * txt = p->line;
00162         while (*txt) {
00163                 txt = getUTF8CharacterFromString(txt, c);
00164                 c.dump("  invalid character");
00165                 ASSERT_THROW(c.getValue() == replaceChar,
00166                     "Value of parsed character should be replacement char!");
00167         }
00168 }
00169 
00170 
00171 
00172 static void
00173 doTest
00174 (
00175 void
00176 )
00177 {
00178         for (const test_line_t * p = s_testLines; p->line; ++p) {
00179                 testLine(p);
00180         }
00181 
00182         for (const invalid_line_t * p = s_invalidLines; p->line[0]; ++p) {
00183                 invalidLine(p);
00184         }
00185 }
00186 
00187 
00188 
00189 ////////////////////////////////////////////////////////////////////////////////
00190 //
00191 //      entry point
00192 //
00193 ////////////////////////////////////////////////////////////////////////////////
00194 
00195 int
00196 main
00197 (
00198 IN int argc,
00199 IN const char * argv[]
00200 )
00201 {
00202         int retval = 0;
00203 
00204         try {
00205                 perf::Timer timer("full timer");
00206 
00207                 doTest();
00208 
00209         } catch (std::exception& e) {
00210                 DPRINTF("Exception: %s", e.what());
00211                 retval = 1;
00212         }
00213 
00214         perf::dumpTimingSummary(std::cerr);
00215 
00216         return retval;
00217 }
00218