Browse Source

Add support for looking up the general category group for a codepoint.

master
Reece H. Dunn 12 years ago
parent
commit
ff7a5e0209
3 changed files with 69 additions and 18 deletions
  1. 21
    0
      src/categories.cpp
  2. 25
    18
      src/include/ucd/ucd.h
  3. 23
    0
      tools/categories.py

+ 21
- 0
src/categories.cpp View File

if (c <= 0x10FFFF) return Cn; // 10FFFE..10FFFF : Plane 16 Private Use if (c <= 0x10FFFF) return Cn; // 10FFFE..10FFFF : Plane 16 Private Use
return Ci; return Ci;
} }

/* Map a codepoint to its General Category Group.
 *
 * The codepoint's General Category is obtained from lookup_category and is
 * then translated to the group that the category is listed under below.
 *
 * c       The Unicode codepoint to look up.
 * return  The General Category Group for the codepoint. Any category value
 *         that is not listed in the switch is reported as C (Other).
 */
ucd::category_group ucd::lookup_category_group(codepoint_t c)
{
	switch (lookup_category(c))
	{
	case Cc: case Cf: case Ci: case Cn: case Co: case Cs: case Zc:
		return C;
	case Ll: case Lm: case Lo: case Lt: case Lu:
		return L;
	case Mc: case Me: case Mn:
		return M;
	case Nd: case Nl: case No:
		return N;
	case Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
		return P;
	case Sc: case Sk: case Sm: case So:
		return S;
	case Zl: case Zp: case Zs:
		return Z;
	}

	// The switch deliberately has no default label so the compiler can still
	// warn when a new category is left unhandled. An enum object may hold a
	// value outside its enumerators, though, and without this return such a
	// value would flow off the end of a non-void function (undefined
	// behaviour). Ci (invalid) is already grouped under C, so use C here too.
	return C;
}

+ 25
- 18
src/include/ucd/ucd.h View File

*/ */
typedef uint32_t codepoint_t; typedef uint32_t codepoint_t;


/** @brief Unicode General Category Groups
  *
  * The coarse groups that the individual General Category values are
  * collected into. The numeric values are spelled out explicitly; they are
  * the same values the enumerators receive by default.
  *
  * @see http://www.unicode.org/reports/tr44/
  */
enum category_group
{
	C = 0, /**< @brief Other */
	L = 1, /**< @brief Letter */
	M = 2, /**< @brief Mark */
	N = 3, /**< @brief Number */
	P = 4, /**< @brief Punctuation */
	S = 5, /**< @brief Symbol */
	Z = 6, /**< @brief Separator */
};

/** @brief Unicode General Category Values /** @brief Unicode General Category Values
* @see http://www.unicode.org/reports/tr44/ * @see http://www.unicode.org/reports/tr44/
*/ */
enum category enum category
{ {
// Other

Cc, /**< @brief Control Character */ Cc, /**< @brief Control Character */
Cf, /**< @brief Format Control Character */ Cf, /**< @brief Format Control Character */
Ci, /**< @brief Invalid Unicode Character */ Ci, /**< @brief Invalid Unicode Character */
Co, /**< @brief Private Use */ Co, /**< @brief Private Use */
Cs, /**< @brief Surrogate Code Point */ Cs, /**< @brief Surrogate Code Point */


// Letter

Ll, /**< @brief Lower Case Letter */ Ll, /**< @brief Lower Case Letter */
Lm, /**< @brief Letter Modifier */ Lm, /**< @brief Letter Modifier */
Lo, /**< @brief Other Letter */ Lo, /**< @brief Other Letter */
Lt, /**< @brief Title Case Letter */ Lt, /**< @brief Title Case Letter */
Lu, /**< @brief Upper Case Letter */ Lu, /**< @brief Upper Case Letter */


// Mark

Mc, /**< @brief Spacing Mark */ Mc, /**< @brief Spacing Mark */
Me, /**< @brief Enclosing Mark */ Me, /**< @brief Enclosing Mark */
Mn, /**< @brief Non-Spacing Mark */ Mn, /**< @brief Non-Spacing Mark */


// Symbol

Sc, /**< @brief Currency Symbol */
Sk, /**< @brief Modifier Symbol */
Sm, /**< @brief Math Symbol */
So, /**< @brief Other Symbol */

// Number

Nd, /**< @brief Decimal Digit */ Nd, /**< @brief Decimal Digit */
Nl, /**< @brief Letter-Like Number */ Nl, /**< @brief Letter-Like Number */
No, /**< @brief Other Number */ No, /**< @brief Other Number */


// Punctuation

Pc, /**< @brief Connector */ Pc, /**< @brief Connector */
Pd, /**< @brief Dash/Hyphen */ Pd, /**< @brief Dash/Hyphen */
Pe, /**< @brief Close Punctuation Mark */ Pe, /**< @brief Close Punctuation Mark */
Po, /**< @brief Other */ Po, /**< @brief Other */
Ps, /**< @brief Open Punctuation Mark */ Ps, /**< @brief Open Punctuation Mark */


// Separator
Sc, /**< @brief Currency Symbol */
Sk, /**< @brief Modifier Symbol */
Sm, /**< @brief Math Symbol */
So, /**< @brief Other Symbol */


Zc, /**< @brief Whitespace character in the Cc category */ Zc, /**< @brief Whitespace character in the Cc category */
Zl, /**< @brief Line Separator */ Zl, /**< @brief Line Separator */
Zs, /**< @brief Space Separator */ Zs, /**< @brief Space Separator */
}; };


/** @brief Look up the General Category Group for a Unicode codepoint.
 *
 * @param c The Unicode codepoint to look up.
 * @return The General Category Group of the Unicode codepoint.
 */
category_group lookup_category_group(codepoint_t c);

/** @brief Lookup the General Category for a Unicode codepoint. /** @brief Lookup the General Category for a Unicode codepoint.
* *
* @param c The Unicode codepoint to lookup. * @param c The Unicode codepoint to lookup.

+ 23
- 0
tools/categories.py View File

sys.stdout.write('\t}\n') sys.stdout.write('\t}\n')
sys.stdout.write('\treturn Ci;\n') sys.stdout.write('\treturn Ci;\n')
sys.stdout.write('}\n') sys.stdout.write('}\n')

# Emit the definition of ucd::lookup_category_group, which switches on the
# result of lookup_category and returns the matching category group.
# The trailing "return C;" after the switch keeps the emitted function from
# flowing off its end (undefined behaviour in C++) when the category value
# is not one of the listed cases.
sys.stdout.write("""
ucd::category_group ucd::lookup_category_group(codepoint_t c)
{
switch (lookup_category(c))
{
case Cc: case Cf: case Ci: case Cn: case Co: case Cs: case Zc:
return C;
case Ll: case Lm: case Lo: case Lt: case Lu:
return L;
case Mc: case Me: case Mn:
return M;
case Nd: case Nl: case No:
return N;
case Pc: case Pd: case Pe: case Pf: case Pi: case Po: case Ps:
return P;
case Sc: case Sk: case Sm: case So:
return S;
case Zl: case Zp: case Zs:
return Z;
}
return C;
}
""")

Loading…
Cancel
Save