rune.c (4071B)
1 /* 2 * The authors of this software are Rob Pike and Ken Thompson. 3 * Copyright (c) 2002 by Lucent Technologies. 4 * Permission to use, copy, modify, and distribute this software for any 5 * purpose without fee is hereby granted, provided that this entire notice 6 * is included in all copies of any software which is or includes a copy 7 * or modification of this software and in all copies of the supporting 8 * documentation for such software. 9 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED 10 * WARRANTY. IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE 11 * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY 12 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE. 13 */ 14 #include "plan9.h" 15 #include "utf.h" 16 17 #include <stdarg.h> 18 #include <string.h> 19 20 enum { 21 Bit1 = 7, 22 Bitx = 6, 23 Bit2 = 5, 24 Bit3 = 4, 25 Bit4 = 3, 26 Bit5 = 2, 27 28 T1 = ((1 << (Bit1 + 1)) - 1) ^ 0xFF, /* 0000 0000 */ 29 Tx = ((1 << (Bitx + 1)) - 1) ^ 0xFF, /* 1000 0000 */ 30 T2 = ((1 << (Bit2 + 1)) - 1) ^ 0xFF, /* 1100 0000 */ 31 T3 = ((1 << (Bit3 + 1)) - 1) ^ 0xFF, /* 1110 0000 */ 32 T4 = ((1 << (Bit4 + 1)) - 1) ^ 0xFF, /* 1111 0000 */ 33 T5 = ((1 << (Bit5 + 1)) - 1) ^ 0xFF, /* 1111 1000 */ 34 35 Rune1 = (1 << (Bit1 + 0 * Bitx)) - 1, /* 0000 0000 0000 0000 0111 1111 */ 36 Rune2 = (1 << (Bit2 + 1 * Bitx)) - 1, /* 0000 0000 0000 0111 1111 1111 */ 37 Rune3 = (1 << (Bit3 + 2 * Bitx)) - 1, /* 0000 0000 1111 1111 1111 1111 */ 38 Rune4 = (1 << (Bit4 + 3 * Bitx)) - 1, /* 0011 1111 1111 1111 1111 1111 */ 39 40 Maskx = (1 << Bitx) - 1, /* 0011 1111 */ 41 Testx = Maskx ^ 0xFF, /* 1100 0000 */ 42 43 Bad = Runeerror 44 }; 45 46 int chartorune(Rune* rune, char* str) { 47 int c, c1, c2, c3; 48 long l; 49 50 /* 51 * one character sequence 52 * 00000-0007F => T1 53 */ 54 c = *(uchar*) str; 55 if (c < Tx) { 56 *rune = c; 57 return 1; 58 } 59 60 /* 61 * two character sequence 62 * 0080-07FF => T2 Tx 63 */ 64 c1 = *(uchar*) (str + 1) ^ Tx; 65 if (c1 & Testx) 66 goto bad; 67 if (c < T3) { 68 if (c < T2) 69 goto bad; 70 l = ((c << Bitx) | c1) & Rune2; 71 if (l <= Rune1) 72 goto bad; 73 *rune = l; 74 return 2; 75 } 76 77 /* 78 * three character sequence 79 * 0800-FFFF => T3 Tx Tx 80 */ 81 c2 = *(uchar*) (str + 2) ^ Tx; 82 if (c2 & Testx) 83 goto bad; 84 if (c < T4) { 85 l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3; 86 if (l <= Rune2) 87 goto bad; 88 *rune = l; 89 return 3; 90 } 91 92 /* 93 * four character sequence 94 * 10000-10FFFF => T4 Tx Tx Tx 95 */ 96 if (UTFmax >= 4) { 97 c3 = *(uchar*) (str + 3) ^ Tx; 98 if (c3 & Testx) 99 goto bad; 100 if (c < T5) { 101 l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; 102 if (l <= Rune3) 103 goto bad; 104 if (l > Runemax) 105 goto bad; 106 *rune = l; 107 return 4; 108 } 109 } 110 111 /* 112 * bad decoding 113 */ 114 bad: 115 *rune = Bad; 116 return 1; 117 } 118 119 int runetochar(char* str, Rune* rune) { 120 long c; 121 122 /* 123 * one character sequence 124 * 00000-0007F => 00-7F 125 */ 126 c = *rune; 127 if (c <= Rune1) { 128 str[0] = c; 129 return 1; 130 } 131 132 /* 133 * two character sequence 134 * 00080-007FF => T2 Tx 135 */ 136 if (c <= Rune2) { 137 str[0] = T2 | (c >> 1 * Bitx); 138 str[1] = Tx | (c & Maskx); 139 return 2; 140 } 141 142 /* 143 * three character sequence 144 * 00800-0FFFF => T3 Tx Tx 145 */ 146 if (c > Runemax) 147 c = Runeerror; 148 if (c <= Rune3) { 149 str[0] = T3 | (c >> 2 * Bitx); 150 str[1] = Tx | ((c >> 1 * Bitx) & Maskx); 151 str[2] = Tx | (c & Maskx); 152 return 3; 153 } 154 155 /* 156 * four character sequence 157 * 010000-1FFFFF => T4 Tx Tx Tx 158 */ 159 str[0] = T4 | (c >> 3 * Bitx); 160 str[1] = Tx | ((c >> 2 * Bitx) & Maskx); 161 str[2] = Tx | ((c >> 1 * Bitx) & Maskx); 162 str[3] = Tx | (c & Maskx); 163 return 4; 164 } 165 166 int runelen(long c) { 167 Rune rune; 168 char str[10]; 169 170 rune = c; 171 return runetochar(str, &rune); 172 } 173 174 int runenlen(Rune* r, int nrune) { 175 int nb, c; 176 177 nb = 0; 178 while (nrune--) { 179 c = *r++; 180 if (c <= Rune1) 181 nb++; 182 else if (c <= Rune2) 183 nb += 2; 184 else if (c <= Rune3 || c > Runemax) 185 nb += 3; 186 else 187 nb += 4; 188 } 189 return nb; 190 } 191 192 int fullrune(char* str, int n) { 193 int c; 194 195 if (n <= 0) 196 return 0; 197 c = *(uchar*) str; 198 if (c < Tx) 199 return 1; 200 if (c < T3) 201 return n >= 2; 202 if (UTFmax == 3 || c < T4) 203 return n >= 3; 204 return n >= 4; 205 }