// Copyright (C) 2002 Graydon Hoare <graydon@pobox.com>
//
// This program is made available under the GNU GPL version 2.0 or
// greater. See the accompanying file COPYING for details.
//
// This program is distributed WITHOUT ANY WARRANTY; without even the
// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
// PURPOSE.

#include "../../../src/base.hh"

#include <cstdlib>

#include "../unit_tests.hh"
#include "../../../src/charset.hh"
#include "../../../src/simplestring_xform.hh"

#define IDNA_ACE_PREFIX "xn--"
#define IDNA_SUCCESS 0

using std::string;

// One IDNA conversion test vector, and the table of all such vectors that
// the idna_encoding test below walks.
struct
idna
{
  char const * name;  // human-readable label, used in checkpoint/log output
  char const * utf;   // the label as UTF-8 encoded bytes
  char const * ace;   // the expected ACE form, beginning with IDNA_ACE_PREFIX
} const idna_vec[] =
  {
    // In C, \x escapes consume an unbounded number of hexadecimal digits,
    // and if the resulting number is too big for a byte it is a semantic
    // error.  However, if a string constant is composed of more than one
    // string literal, they do not extend across a boundary between string
    // literals.  Thus, in some places in this array, string literals have
    // been split solely to end \x escapes after two hex digits (for
    // example, "\x8d""esky" keeps the 'e' from being read as a hex digit).
    {
      "Arabic (Egyptian)",
      "\xd9\x84\xd9\x8a\xd9\x87\xd9\x85\xd8\xa7\xd8\xa8\xd8\xaa\xd9\x83\xd9"
      "\x84\xd9\x85\xd9\x88\xd8\xb4\xd8\xb9\xd8\xb1\xd8\xa8\xd9\x8a\xd8\x9f",
      IDNA_ACE_PREFIX "egbpdaj6bu4bxfgehfvwxn" },
    {
      "Chinese (simplified)",
      "\xe4\xbb\x96\xe4\xbb\xac\xe4\xb8\xba\xe4\xbb\x80\xe4\xb9\x88\xe4\xb8"
      "\x8d\xe8\xaf\xb4\xe4\xb8\xad\xe6\x96\x87",
      IDNA_ACE_PREFIX "ihqwcrb4cv8a8dqg056pqjye" },
    {
      "Chinese (traditional)",
      "\xe4\xbb\x96\xe5\x80\x91\xe7\x88\xb2\xe4\xbb\x80\xe9\xba\xbd\xe4\xb8"
      "\x8d\xe8\xaa\xaa\xe4\xb8\xad\xe6\x96\x87",
      IDNA_ACE_PREFIX "ihqwctvzc91f659drss3x8bo0yb" },
    {
      "Czech",
      "Pro\xc4\x8dprost\xc4\x9bnemluv\xc3\xad\xc4\x8d""esky",
      IDNA_ACE_PREFIX "Proprostnemluvesky-uyb24dma41a"},
    {
      "Hebrew",
      "\xd7\x9c\xd7\x9e\xd7\x94\xd7\x94\xd7\x9d\xd7\xa4\xd7\xa9\xd7\x95\xd7"
      "\x98\xd7\x9c\xd7\x90\xd7\x9e\xd7\x93\xd7\x91\xd7\xa8\xd7\x99\xd7\x9d"
      "\xd7\xa2\xd7\x91\xd7\xa8\xd7\x99\xd7\xaa",
      IDNA_ACE_PREFIX "4dbcagdahymbxekheh6e0a7fei0b"},
    {
      "Hindi (Devanagari)",
      "\xe0\xa4\xaf\xe0\xa4\xb9\xe0\xa4\xb2\xe0\xa5\x8b\xe0\xa4\x97\xe0\xa4"
      "\xb9\xe0\xa4\xbf\xe0\xa4\xa8\xe0\xa5\x8d\xe0\xa4\xa6\xe0\xa5\x80\xe0"
      "\xa4\x95\xe0\xa5\x8d\xe0\xa4\xaf\xe0\xa5\x8b\xe0\xa4\x82\xe0\xa4\xa8"
      "\xe0\xa4\xb9\xe0\xa5\x80\xe0\xa4\x82\xe0\xa4\xac\xe0\xa5\x8b\xe0\xa4"
      "\xb2\xe0\xa4\xb8\xe0\xa4\x95\xe0\xa4\xa4\xe0\xa5\x87\xe0\xa4\xb9\xe0"
      "\xa5\x88\xe0\xa4\x82",
      IDNA_ACE_PREFIX "i1baa7eci9glrd9b2ae1bj0hfcgg6iyaf8o0a1dig0cd"},
    {
      "Japanese (kanji and hiragana)",
      "\xe3\x81\xaa\xe3\x81\x9c\xe3\x81\xbf\xe3\x82\x93\xe3\x81\xaa\xe6\x97"
      "\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x82\x92\xe8\xa9\xb1\xe3\x81\x97\xe3"
      "\x81\xa6\xe3\x81\x8f\xe3\x82\x8c\xe3\x81\xaa\xe3\x81\x84\xe3\x81\xae"
      "\xe3\x81\x8b",
      IDNA_ACE_PREFIX "n8jok5ay5dzabd5bym9f0cm5685rrjetr6pdxa"},
    {
      "Russian (Cyrillic)",
      "\xd0\xbf\xd0\xbe\xd1\x87\xd0\xb5\xd0\xbc\xd1\x83\xd0\xb6\xd0\xb5\xd0"
      "\xbe\xd0\xbd\xd0\xb8\xd0\xbd\xd0\xb5\xd0\xb3\xd0\xbe\xd0\xb2\xd0\xbe"
      "\xd1\x80\xd1\x8f\xd1\x82\xd0\xbf\xd0\xbe\xd1\x80\xd1\x83\xd1\x81\xd1"
      "\x81\xd0\xba\xd0\xb8",
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l"},
    {
      "Spanish",
      "Porqu\xc3\xa9nopuedensimplementehablarenEspa\xc3\xb1ol",
      IDNA_ACE_PREFIX "PorqunopuedensimplementehablarenEspaol-fmd56a"},
    {
      "Vietnamese",
      "T\xe1\xba\xa1isaoh\xe1\xbb\x8dkh\xc3\xb4ngth\xe1\xbb\x83""ch\xe1\xbb"
      "\x89n\xc3\xb3iti\xe1\xba\xbfngVi\xe1\xbb\x87t",
      IDNA_ACE_PREFIX "TisaohkhngthchnitingVit-kjcr8268qyxafd2f1b9g"},
    // The following entries all carry the same "Japanese" label, so they
    // cannot be told apart by name in the test's checkpoint output.
    {
      "Japanese",
      "3\xe5\xb9\xb4""B\xe7\xb5\x84\xe9\x87\x91\xe5\x85\xab\xe5\x85\x88\xe7"
      "\x94\x9f",
      IDNA_ACE_PREFIX "3B-ww4c5e180e575a65lsy2b"},
    {
      "Japanese",
      "\xe5\xae\x89\xe5\xae\xa4\xe5\xa5\x88\xe7\xbe\x8e\xe6\x81\xb5-with-"
      "SUPER-MONKEYS",
      IDNA_ACE_PREFIX "-with-SUPER-MONKEYS-pc58ag80a8qai00g7n9n"},
    {
      "Japanese",
      "Hello-Another-Way-\xe3\x81\x9d\xe3\x82\x8c\xe3\x81\x9e\xe3\x82\x8c"
      "\xe3\x81\xae\xe5\xa0\xb4\xe6\x89\x80",
      IDNA_ACE_PREFIX "Hello-Another-Way--fc4qua05auwb3674vfr0b"},
    {
      "Japanese",
      "\xe3\x81\xb2\xe3\x81\xa8\xe3\x81\xa4\xe5\xb1\x8b\xe6\xa0\xb9\xe3\x81"
      "\xae\xe4\xb8\x8b""2",
      IDNA_ACE_PREFIX "2-u9tlzr9756bt3uc0v"},
    {
      "Japanese",
      "Maji\xe3\x81\xa7Koi\xe3\x81\x99\xe3\x82\x8b""5\xe7\xa7\x92\xe5\x89\x8d",
      IDNA_ACE_PREFIX "MajiKoi5-783gue6qz075azm5e"},
    {
      "Japanese",
      "\xe3\x83\x91\xe3\x83\x95\xe3\x82\xa3\xe3\x83\xbc""de\xe3\x83\xab\xe3\x83"
      "\xb3\xe3\x83\x90",
      IDNA_ACE_PREFIX "de-jg4avhby1noc0d"},
    {
      "Japanese",
      "\xe3\x81\x9d\xe3\x81\xae\xe3\x82\xb9\xe3\x83\x94\xe3\x83\xbc\xe3\x83"
      "\x89\xe3\x81\xa7",
      IDNA_ACE_PREFIX "d9juau41awczczp"},
    {
      "Greek",
      "\xce\xb5\xce\xbb\xce\xbb\xce\xb7\xce\xbd\xce\xb9\xce\xba\xce\xac",
      IDNA_ACE_PREFIX "hxargifdar"},
    {
      "Maltese (Malti)",
      "bon\xc4\xa1usa\xc4\xa7\xc4\xa7""a",
      IDNA_ACE_PREFIX "bonusaa-5bb1da"},
    // NOTE(review): this entry is byte-for-byte identical to the earlier
    // "Russian (Cyrillic)" vector, so it exercises nothing new -- confirm
    // whether the duplication is intentional.
    {
      "Russian (Cyrillic)",
      "\xd0\xbf\xd0\xbe\xd1\x87\xd0\xb5\xd0\xbc\xd1\x83\xd0\xb6\xd0\xb5\xd0"
      "\xbe\xd0\xbd\xd0\xb8\xd0\xbd\xd0\xb5\xd0\xb3\xd0\xbe\xd0\xb2\xd0\xbe"
      "\xd1\x80\xd1\x8f\xd1\x82\xd0\xbf\xd0\xbe\xd1\x80\xd1\x83\xd1\x81\xd1"
      "\x81\xd0\xba\xd0\xb8",
      IDNA_ACE_PREFIX "b1abfaaepdrnnbgefbadotcwatmq2g4l"},
  };

// Round-trips every entry of idna_vec through utf8_to_ace and ace_to_utf8.
// Both the inputs and the conversion results are lowercased before being
// compared, so the comparison is insensitive to letter case.
UNIT_TEST(idna_encoding)
{
  // putenv takes a char*, not a const char*, there is nothing we can do.
  // Its result is recorded and checked: if the variable could not be set,
  // the conversions below would run under whatever CHARSET happens to be
  // in the environment, and any failures would be hard to interpret.
  int const putenv_result = putenv(const_cast<char *>("CHARSET=UTF-8"));
  UNIT_TEST_CHECK(putenv_result == 0);

  size_t const n_vectors = sizeof(idna_vec) / sizeof(idna_vec[0]);
  for (size_t i = 0; i < n_vectors; ++i)
    {
      // Record which vector is being processed.
      UNIT_TEST_CHECKPOINT(("IDNA language: "
                            + string(idna_vec[i].name)).c_str());

      string u = lowercase(idna_vec[i].utf);
      string a = lowercase(idna_vec[i].ace);

      // UTF-8 -> ACE must produce the expected ACE string.
      string tace;
      utf8_to_ace(utf8(u, origin::internal), tace);
      L(FL("ACE-encoded %s: '%s'") % idna_vec[i].name % tace);
      UNIT_TEST_CHECK(a == lowercase(tace));

      // ACE -> UTF-8 must give back the original UTF-8 string.
      utf8 tutf;
      ace_to_utf8(a, tutf, origin::internal);
      L(FL("UTF-encoded %s: '%s'") % idna_vec[i].name % tutf);
      UNIT_TEST_CHECK(u == lowercase(tutf()));
    }
}

// Checks that utf8_validate returns true for every entry of good_strings
// and false for every entry of bad_strings.  Both arrays are terminated by
// a null pointer, which is what ends the loops at the bottom.
UNIT_TEST(utf8_validation)
{
  // these tests are based on the tests from the file utf8-validate.c of the
  // GLib library, and also include sequences from Markus Kuhn's UTF-8
  // example files.
  const char* good_strings[] = {
    "this is a valid but boring ASCII string",

    "\x28\x28\x56\xe2\x8d\xb3\x56\x29\x3d\xe2\x8d\xb3\xe2\x8d\xb4\x56\x29\x2f"
    "\x56\xe2\x86\x90\x2c\x56\x20\x20\x20\x20\xe2\x8c\xb7\xe2\x86\x90\xe2\x8d"
    "\xb3\xe2\x86\x92\xe2\x8d\xb4\xe2\x88\x86\xe2\x88\x87\xe2\x8a\x83\xe2\x80"
    "\xbe\xe2\x8d\x8e\xe2\x8d\x95\xe2\x8c\x88",

    "\xe2\x80\x98\x73\x69\x6e\x67\x6c\x65\xe2\x80\x99\x20\x61\x6e\x64\x20\xe2"
    "\x80\x9c\x64\x6f\x75\x62\x6c\x65\xe2\x80\x9d\x20\x71\x75\x6f\x74\x65\x73",

    "\xe2\x80\xa2\x20\x43\x75\x72\x6c\x79\x20\x61\x70\x6f\x73\x74\x72\x6f\x70"
    "\x68\x65\x73\x3a\x20\xe2\x80\x9c\x57\x65\xe2\x80\x99\x76\x65\x20\x62\x65"
    "\x65\x6e\x20\x68\x65\x72\x65\xe2\x80\x9d",

    "\xe2\x80\x9a\x64\x65\x75\x74\x73\x63\x68\x65\xe2\x80\x98\x20\xe2\x80\x9e"
    "\x41\x6e\x66\xc3\xbc\x68\x72\x75\x6e\x67\x73\x7a\x65\x69\x63\x68\x65\x6e"
    "\xe2\x80\x9c",

    "\xe2\x80\xa0\x2c\x20\xe2\x80\xa1\x2c\x20\xe2\x80\xb0\x2c\x20\xe2\x80\xa2"
    "\x2c\x20\x33\xe2\x80\x93\x34\x2c\x20\xe2\x80\x94\x2c\x20\xe2\x88\x92\x35"
    "\x2f\x2b\x35\x2c\x20\xe2\x84\xa2\x2c\x20\xe2\x80\xa6",

    "\xc2\xa9\xc2\xa9\xc2\xa9",
    "\xe2\x89\xa0\xe2\x89\xa0",
    "\xce\xba\xe1\xbd\xb9\xcf\x83\xce\xbc\xce\xb5",
    // NOTE(review): this literal is passed around as a plain char const*,
    // so it presumably reaches utf8_validate as an empty string rather
    // than as a single NUL byte -- confirm against the utf8 constructor.
    "\x00",
    // smallest code point of the 2-, 3- and 4-byte forms:
    // U+0080, U+0800, U+10000
    "\xc2\x80",
    "\xe0\xa0\x80",
    "\xf0\x90\x80\x80",
    // largest code point of the 1- and 2-byte forms: U+007F, U+07FF
    "\x7f",
    "\xdf\xbf",
    // code points on either side of the surrogate range (U+D7FF, U+E000),
    // then U+FFFD
    "\xed\x9f\xbf",
    "\xee\x80\x80",
    "\xef\xbf\xbd",
    0
  };
  const char* bad_strings[] = {
    // 5- and 6-byte sequences (lead bytes 0xf8 and 0xfc)
    "\xf8\x88\x80\x80\x80",
    "\xfc\x84\x80\x80\x80\x80",
    // U+FFFF, then the all-ones payload of the 4-, 5- and 6-byte forms
    "\xef\xbf\xbf",
    "\xf7\xbf\xbf\xbf",
    "\xfb\xbf\xbf\xbf\xbf",
    "\xfd\xbf\xbf\xbf\xbf\xbf",
    // U+10FFFF and U+110000
    "\xf4\x8f\xbf\xbf",
    "\xf4\x90\x80\x80",
    // runs of continuation bytes (0x80..0xbf) with no lead byte
    "\x80",
    "\xbf",
    "\x80\xbf",
    "\x80\xbf\x80",
    "\x80\xbf\x80\xbf",
    "\x80\xbf\x80\xbf\x80",
    "\x80\xbf\x80\xbf\x80\xbf",
    "\x80\xbf\x80\xbf\x80\xbf\x80",
    // every continuation byte 0x80..0xbf on its own
    "\x80",
    "\x81",
    "\x82",
    "\x83",
    "\x84",
    "\x85",
    "\x86",
    "\x87",
    "\x88",
    "\x89",
    "\x8a",
    "\x8b",
    "\x8c",
    "\x8d",
    "\x8e",
    "\x8f",
    "\x90",
    "\x91",
    "\x92",
    "\x93",
    "\x94",
    "\x95",
    "\x96",
    "\x97",
    "\x98",
    "\x99",
    "\x9a",
    "\x9b",
    "\x9c",
    "\x9d",
    "\x9e",
    "\x9f",
    "\xa0",
    "\xa1",
    "\xa2",
    "\xa3",
    "\xa4",
    "\xa5",
    "\xa6",
    "\xa7",
    "\xa8",
    "\xa9",
    "\xaa",
    "\xab",
    "\xac",
    "\xad",
    "\xae",
    "\xaf",
    "\xb0",
    "\xb1",
    "\xb2",
    "\xb3",
    "\xb4",
    "\xb5",
    "\xb6",
    "\xb7",
    "\xb8",
    "\xb9",
    "\xba",
    "\xbb",
    "\xbc",
    "\xbd",
    "\xbe",
    "\xbf",
    // every lead byte 0xc0..0xfd followed by a space instead of a
    // continuation byte
    "\xc0\x20",
    "\xc1\x20",
    "\xc2\x20",
    "\xc3\x20",
    "\xc4\x20",
    "\xc5\x20",
    "\xc6\x20",
    "\xc7\x20",
    "\xc8\x20",
    "\xc9\x20",
    "\xca\x20",
    "\xcb\x20",
    "\xcc\x20",
    "\xcd\x20",
    "\xce\x20",
    "\xcf\x20",
    "\xd0\x20",
    "\xd1\x20",
    "\xd2\x20",
    "\xd3\x20",
    "\xd4\x20",
    "\xd5\x20",
    "\xd6\x20",
    "\xd7\x20",
    "\xd8\x20",
    "\xd9\x20",
    "\xda\x20",
    "\xdb\x20",
    "\xdc\x20",
    "\xdd\x20",
    "\xde\x20",
    "\xdf\x20",
    "\xe0\x20",
    "\xe1\x20",
    "\xe2\x20",
    "\xe3\x20",
    "\xe4\x20",
    "\xe5\x20",
    "\xe6\x20",
    "\xe7\x20",
    "\xe8\x20",
    "\xe9\x20",
    "\xea\x20",
    "\xeb\x20",
    "\xec\x20",
    "\xed\x20",
    "\xee\x20",
    "\xef\x20",
    "\xf0\x20",
    "\xf1\x20",
    "\xf2\x20",
    "\xf3\x20",
    "\xf4\x20",
    "\xf5\x20",
    "\xf6\x20",
    "\xf7\x20",
    "\xf8\x20",
    "\xf9\x20",
    "\xfa\x20",
    "\xfb\x20",
    "\xfc\x20",
    "\xfd\x20",
    // sequences that end one continuation byte short (2- to 6-byte forms),
    // first with all-zero payload bits, then with all-one payload bits
    "\x20\xc0",
    "\x20\xe0\x80",
    "\x20\xf0\x80\x80",
    "\x20\xf8\x80\x80\x80",
    "\x20\xfc\x80\x80\x80\x80",
    "\x20\xdf",
    "\x20\xef\xbf",
    "\x20\xf7\xbf\xbf",
    "\x20\xfb\xbf\xbf\xbf",
    "\x20\xfd\xbf\xbf\xbf\xbf",
    // the bytes 0xfe and 0xff
    "\x20\xfe\x20",
    "\x20\xff\x20",
    // overlong encodings of U+002F ('/') in 2 to 6 bytes
    "\x20\xc0\xaf\x20",
    "\x20\xe0\x80\xaf\x20",
    "\x20\xf0\x80\x80\xaf\x20",
    "\x20\xf8\x80\x80\x80\xaf\x20",
    "\x20\xfc\x80\x80\x80\x80\xaf\x20",
    // the largest overlong encoding of each length: each decodes to the
    // highest code point that fits in a sequence one byte shorter
    "\x20\xc1\xbf\x20",
    "\x20\xe0\x9f\xbf\x20",
    "\x20\xf0\x8f\xbf\xbf\x20",
    "\x20\xf8\x87\xbf\xbf\xbf\x20",
    "\x20\xfc\x83\xbf\xbf\xbf\xbf\x20",
    // overlong encodings of U+0000 in 2 to 6 bytes
    "\x20\xc0\x80\x20",
    "\x20\xe0\x80\x80\x20",
    "\x20\xf0\x80\x80\x80\x20",
    "\x20\xf8\x80\x80\x80\x80\x20",
    "\x20\xfc\x80\x80\x80\x80\x80\x20",
    // single UTF-16 surrogates: U+D800, U+DB7F, U+DB80, U+DBFF, U+DC00,
    // U+DF80, U+DFFF
    "\x20\xed\xa0\x80\x20",
    "\x20\xed\xad\xbf\x20",
    "\x20\xed\xae\x80\x20",
    "\x20\xed\xaf\xbf\x20",
    "\x20\xed\xb0\x80\x20",
    "\x20\xed\xbe\x80\x20",
    "\x20\xed\xbf\xbf\x20",
    // paired UTF-16 surrogates (a high surrogate followed by a low one)
    "\x20\xed\xa0\x80\xed\xb0\x80\x20",
    "\x20\xed\xa0\x80\xed\xbf\xbf\x20",
    "\x20\xed\xad\xbf\xed\xb0\x80\x20",
    "\x20\xed\xad\xbf\xed\xbf\xbf\x20",
    "\x20\xed\xae\x80\xed\xb0\x80\x20",
    "\x20\xed\xae\x80\xed\xbf\xbf\x20",
    "\x20\xed\xaf\xbf\xed\xb0\x80\x20",
    "\x20\xed\xaf\xbf\xed\xbf\xbf\x20",
    // U+FFFE and U+FFFF
    "\x20\xef\xbf\xbe\x20",
    "\x20\xef\xbf\xbf\x20",
    0
  };

  // every good string must be accepted by the validator...
  for (int i = 0; good_strings[i]; ++i)
    UNIT_TEST_CHECK(utf8_validate(utf8(good_strings[i])) == true);

  // ...and every bad string must be rejected.
  for (int i = 0; bad_strings[i]; ++i)
    UNIT_TEST_CHECK(utf8_validate(utf8(bad_strings[i])) == false);
}

// Local Variables:
// mode: C++
// fill-column: 76
// c-file-style: "gnu"
// indent-tabs-mode: nil
// End:
// vim: et:sw=2:sts=2:ts=2:cino=>2s,{s,\:s,+s,t0,g0,^-2,e-2,n-2,p2s,(0,=s:
