/**
* Convert a file of UTF-8 to HTML.
*/
#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/* How many more unmappable characters replace() accepts before it exits;
   starts at 100 and is overridden by the -t option in main(). */
static int tolerance = 100;
/* Text replace() prints in place of an unmappable character; main() sets
   it to "?" and the -d option overrides it. */
static char *defaultValue;
/* Output strings, indexed by the offsets computed in putUnicodePoint().
   main() fills the table; an entry of "?" marks a slot with no mapping. */
static char *ochar[2347];
/**
 * Read one UTF-8 encoded character from a stream.
 *
 * @param infp  stream to read from; should be opened in binary mode.
 * @return the decoded code point, or EOF once the stream is exhausted.
 *         Any malformed sequence yields U+FFFD (REPLACEMENT CHARACTER)
 *         instead of a bogus value: a stray continuation byte, an invalid
 *         lead byte, a missing or wrong continuation byte, input that ends
 *         in mid-character, an overlong form, a surrogate, or a value
 *         above U+10FFFF.
 */
int getUnicodePoint( FILE * infp ) {
    enum { REPLACEMENT_CHAR = 0xFFFD };
    int c;
    int answer;
    int bytes;      /* continuation bytes still expected */
    int minimum;    /* smallest code point that needs this many bytes */

    c = fgetc( infp );
    if ( EOF == c || c < 0x80 ) {
        return c;   /* end of input, or plain ASCII */
    }

    /* The lead byte fixes the sequence length and carries the top bits. */
    if ( 0xC0 == ( c & 0xE0 ) ) {
        bytes = 1;
        minimum = 0x80;
        answer = c & 0x1F;
    }
    else if ( 0xE0 == ( c & 0xF0 ) ) {
        bytes = 2;
        minimum = 0x800;
        answer = c & 0x0F;
    }
    else if ( 0xF0 == ( c & 0xF8 ) ) {
        bytes = 3;
        minimum = 0x10000;
        answer = c & 0x07;
    }
    else {
        /* 0x80..0xBF is a continuation byte with no lead; 0xF8..0xFF
           never appears in UTF-8. */
        return REPLACEMENT_CHAR;
    }

    while ( 0 < bytes ) {
        c = fgetc( infp );
        if ( EOF == c ) {
            /* Truncated character; the next call reports EOF. */
            return REPLACEMENT_CHAR;
        }
        if ( 0x80 != ( c & 0xC0 ) ) {
            /* Not a continuation byte: push it back so it is decoded as
               the start of the next character. */
            ungetc( c, infp );
            return REPLACEMENT_CHAR;
        }
        answer = ( answer << 6 ) | ( c & 0x3F );
        bytes--;
    }

    /* Reject overlong encodings, UTF-16 surrogates and out-of-range values. */
    if ( answer < minimum || answer > 0x10FFFF
         || ( 0xD800 <= answer && answer <= 0xDFFF ) ) {
        return REPLACEMENT_CHAR;
    }
    return answer;
}
/**
 * Handle a code point that has no entry in the output table.
 *
 * Reports the code point on stderr, then either exits with status 1 (when
 * the tolerance budget is already used up) or consumes one unit of the
 * budget and writes the default replacement text to stdout.
 *
 * The replacement text is NOT handed to printf() as a format string: it
 * comes from the command line, and an unexpected conversion would read
 * arguments that do not exist. Only these sequences are expanded:
 *   %d  the code point in decimal
 *   %x  the code point in lower-case hexadecimal
 *   %X  the code point in upper-case hexadecimal
 *   %%  a literal percent sign
 * Every other character, including a '%' followed by anything else, is
 * written unchanged.
 *
 * @param c  the unmappable code point.
 */
void replace( int c ) {
    const char *p;

    fprintf( stderr,
             "Unicode code point %d has no equivalent HTML entity.\n", c );
    if ( 0 == tolerance ) {
        fprintf( stderr, "Too many unmappable characters. Quitting.\n" );
        exit( 1 );
    }
    /* A negative tolerance never reaches zero, so it means "no limit";
       leave it alone rather than counting down toward integer overflow. */
    if ( 0 < tolerance ) {
        tolerance--;
    }

    if ( NULL == defaultValue ) {
        return;
    }
    for ( p = defaultValue; '\0' != *p; p++ ) {
        if ( '%' != *p ) {
            putchar( (unsigned char) *p );
            continue;
        }
        switch ( p[1] ) {
        case 'd':
            printf( "%d", c );
            p++;
            break;
        case 'x':
            printf( "%x", (unsigned int) c );
            p++;
            break;
        case 'X':
            printf( "%X", (unsigned int) c );
            p++;
            break;
        case '%':
            putchar( '%' );
            p++;
            break;
        default:
            /* Unsupported or trailing '%': emit it literally. */
            putchar( '%' );
            break;
        }
    }
}
/**
 * Find the ochar slot that belongs to a code point.
 *
 * The table packs three bands of code points back to back:
 *   128..999     -> slot c - 128
 *   1000..9999   -> slot c - 7200 - 128
 *   10000 and up -> slot c - 61900
 * A slot is only valid for a band if it lies at or after the first slot
 * left free by the previous band and inside the table. Without the lower
 * limit a code point such as U+1CC1 would borrow the slot of U+00A1;
 * without the upper limit the lookup would read past the end of ochar.
 * With the table's declared size no code point of the third band currently
 * has a slot.
 *
 * @param c  a non-negative code point.
 * @return the slot index, or -1 when the code point has no slot.
 */
static int ocharSlot( int c ) {
    int slot;
    int first;
    int count = (int) ( sizeof ochar / sizeof ochar[0] );

    if ( c < 1000 ) {
        slot = c - 128;
        first = 0;
    }
    else if ( c < 10000 ) {
        slot = c - 7200 - 128;
        first = 1000 - 128;
    }
    else {
        slot = c - 61900;
        first = 10000 - 7200 - 128;
    }
    if ( slot < first || slot >= count ) {
        return -1;
    }
    return slot;
}

/**
 * Write one code point to stdout.
 *
 * ASCII (0..127) is copied through unchanged. Any other code point is
 * looked up in ochar; when it has no slot, or its slot holds the "?"
 * placeholder, it is passed to replace() instead.
 *
 * @param c  the code point to write.
 */
void putUnicodePoint( int c ) {
    int slot;
    const char *text;

    if ( 0 <= c && c < 128 ) {
        putchar( c );
        return;
    }

    slot = ( c < 0 ) ? -1 : ocharSlot( c );
    text = ( slot < 0 ) ? NULL : ochar[slot];

    /* Compare contents, not addresses: two "?" literals are not
       guaranteed to share storage. */
    if ( NULL == text || 0 == strcmp( text, "?" ) ) {
        replace( c );
    }
    else {
        /* fputs, so the table text is never interpreted as a format. */
        fputs( text, stdout );
    }
}
/**
 * Convert every file named on the command line, in order.
 *
 * @param argc  number of entries in argv, including argv[0].
 * @param argv  argv[0] is skipped; each later entry is a file name, with
 *              "-" meaning standard input.
 *
 * A file that cannot be opened is reported on stderr and skipped. Each
 * file opened here is closed again once it has been converted; stdin is
 * left open.
 */
void doFiles( int argc, char **argv ) {
    FILE *infp;
    char *infile;
    int c;

    while ( --argc > 0 ) {
        infile = *++argv;
        if ( 0 == strcmp( infile, "-" ) ) {
            infp = stdin;
        }
        /* Must open input in binary mode so we can catch chars with high bit set. */
        else if ( NULL == ( infp = fopen( infile, "rb" ) ) ) {
            fprintf( stderr, "Unable to open input file \"%s\".\n", infile );
            continue;
        }

        while ( EOF != ( c = getUnicodePoint( infp ) ) ) {
            putUnicodePoint( c );
        }

        /* EOF also ends the loop on a read error; say so instead of
           silently producing truncated output. */
        if ( ferror( infp ) ) {
            fprintf( stderr, "Error reading input file \"%s\".\n", infile );
        }
        if ( stdin != infp ) {
            fclose( infp );
        }
    }
}
/* One populated slot of ochar: the slot number and the text to emit. */
struct entitySlot {
    int slot;
    char *text;
};

/**
 * Fill ochar. Every slot first receives the "?" placeholder, then the
 * slots with a known mapping are overwritten.
 *
 * The texts are ASCII-only HTML character references, so the output does
 * not depend on the encoding this source file is saved in. U+017D and
 * U+017E have no HTML 4 entity name and use numeric references.
 */
static void initEntityTable( void ) {
    static const struct entitySlot known[] = {
        /* U+00A0..U+00FF: slot = code point - 128 */
        { 32, "&nbsp;" },    { 33, "&iexcl;" },   { 34, "&cent;" },
        { 35, "&pound;" },   { 36, "&curren;" },  { 37, "&yen;" },
        { 38, "&brvbar;" },  { 39, "&sect;" },    { 40, "&uml;" },
        { 41, "&copy;" },    { 42, "&ordf;" },    { 43, "&laquo;" },
        { 44, "&not;" },     { 45, "&shy;" },     { 46, "&reg;" },
        { 47, "&macr;" },    { 48, "&deg;" },     { 49, "&plusmn;" },
        { 50, "&sup2;" },    { 51, "&sup3;" },    { 52, "&acute;" },
        { 53, "&micro;" },   { 54, "&para;" },    { 55, "&middot;" },
        { 56, "&cedil;" },   { 57, "&sup1;" },    { 58, "&ordm;" },
        { 59, "&raquo;" },   { 60, "&frac14;" },  { 61, "&frac12;" },
        { 62, "&frac34;" },  { 63, "&iquest;" },  { 64, "&Agrave;" },
        { 65, "&Aacute;" },  { 66, "&Acirc;" },   { 67, "&Atilde;" },
        { 68, "&Auml;" },    { 69, "&Aring;" },   { 70, "&AElig;" },
        { 71, "&Ccedil;" },  { 72, "&Egrave;" },  { 73, "&Eacute;" },
        { 74, "&Ecirc;" },   { 75, "&Euml;" },    { 76, "&Igrave;" },
        { 77, "&Iacute;" },  { 78, "&Icirc;" },   { 79, "&Iuml;" },
        { 80, "&ETH;" },     { 81, "&Ntilde;" },  { 82, "&Ograve;" },
        { 83, "&Oacute;" },  { 84, "&Ocirc;" },   { 85, "&Otilde;" },
        { 86, "&Ouml;" },    { 87, "&times;" },   { 88, "&Oslash;" },
        { 89, "&Ugrave;" },  { 90, "&Uacute;" },  { 91, "&Ucirc;" },
        { 92, "&Uuml;" },    { 93, "&Yacute;" },  { 94, "&THORN;" },
        { 95, "&szlig;" },   { 96, "&agrave;" },  { 97, "&aacute;" },
        { 98, "&acirc;" },   { 99, "&atilde;" },  { 100, "&auml;" },
        { 101, "&aring;" },  { 102, "&aelig;" },  { 103, "&ccedil;" },
        { 104, "&egrave;" }, { 105, "&eacute;" }, { 106, "&ecirc;" },
        { 107, "&euml;" },   { 108, "&igrave;" }, { 109, "&iacute;" },
        { 110, "&icirc;" },  { 111, "&iuml;" },   { 112, "&eth;" },
        { 113, "&ntilde;" }, { 114, "&ograve;" }, { 115, "&oacute;" },
        { 116, "&ocirc;" },  { 117, "&otilde;" }, { 118, "&ouml;" },
        { 119, "&divide;" }, { 120, "&oslash;" }, { 121, "&ugrave;" },
        { 122, "&uacute;" }, { 123, "&ucirc;" },  { 124, "&uuml;" },
        { 125, "&yacute;" }, { 126, "&thorn;" },  { 127, "&yuml;" },
        /* U+0152..U+03C0: slot = code point - 128 */
        { 210, "&OElig;" },  { 211, "&oelig;" },
        { 224, "&Scaron;" }, { 225, "&scaron;" },
        { 248, "&Yuml;" },
        { 253, "&#381;" },   { 254, "&#382;" },
        { 274, "&fnof;" },
        { 582, "&circ;" },   { 604, "&tilde;" },
        { 809, "&Omega;" },  { 832, "&pi;" },
        /* U+2013..U+25CA: slot = code point - 7200 - 128 */
        { 883, "&ndash;" },  { 884, "&mdash;" },
        { 888, "&lsquo;" },  { 889, "&rsquo;" },  { 890, "&sbquo;" },
        { 892, "&ldquo;" },  { 893, "&rdquo;" },  { 894, "&bdquo;" },
        { 896, "&dagger;" }, { 897, "&Dagger;" }, { 898, "&bull;" },
        { 902, "&hellip;" }, { 912, "&permil;" },
        { 921, "&lsaquo;" }, { 922, "&rsaquo;" },
        { 932, "&frasl;" },
        { 1036, "&euro;" },  { 1154, "&trade;" },
        { 1378, "&part;" },  { 1391, "&prod;" },  { 1393, "&sum;" },
        { 1402, "&radic;" }, { 1406, "&infin;" }, { 1419, "&int;" },
        { 1448, "&asymp;" }, { 1472, "&ne;" },
        { 1476, "&le;" },    { 1477, "&ge;" },
        { 2346, "&loz;" }
    };
    size_t i;

    /* Bound the loop by the array itself so it can never run past it. */
    for ( i = 0; i < sizeof ochar / sizeof ochar[0]; i++ ) {
        ochar[i] = "?";
    }
    for ( i = 0; i < sizeof known / sizeof known[0]; i++ ) {
        ochar[known[i].slot] = known[i].text;
    }
}

/* Print the command-line synopsis on stderr. */
static void usage( void ) {
    fprintf( stderr, "usage: utf2iso [-t n] [-d s] [files]\n" );
    fprintf( stderr, " where n is how many errors to tolerate\n" );
    fprintf( stderr, " and s is a default string" );
    fprintf( stderr, " for unmappable chars.\n" );
    fprintf( stderr, " s can contain something like %%d" );
    fprintf( stderr, " or be empty.\n" );
}

/**
 * Program entry point.
 *
 * Options:
 *   -t n  number of unmappable characters to tolerate before quitting
 *   -d s  text written in place of an unmappable character (default "?")
 * The remaining arguments are input files; with none, standard input is
 * converted. The result is written to standard output.
 *
 * @return 0 on success; the process exits with status 1 on a usage error.
 */
int main( int argc, char **argv ) {
    extern int optind;
    int c;
    long parsed;
    char *end;
    char * oneName[2];

    initEntityTable();
    defaultValue = "?";

    while ( -1 != ( c = getopt( argc, argv, "t:d:" ) ) ) {
        switch (c) {
        case 't':
            fprintf( stderr, "Tolerance given as %s.\n", optarg );
            /* strtol rather than atoi, so "abc" or "12x" is rejected
               instead of silently becoming 0 or 12. */
            errno = 0;
            parsed = strtol( optarg, &end, 10 );
            if ( end == optarg || '\0' != *end || ERANGE == errno
                 || parsed < INT_MIN || parsed > INT_MAX ) {
                fprintf( stderr, "Invalid tolerance \"%s\".\n", optarg );
                usage();
                exit( 1 );
            }
            tolerance = (int) parsed;
            break;
        case 'd':
            defaultValue = optarg;
            break;
        default:
            usage();
            exit( 1 );
        }
    }

    if ( optind >= argc ) {
        /* No file operands: read standard input. */
        oneName[0] = argv[0];
        oneName[1] = "-";
        doFiles( 2, oneName );
    }
    else {
        /* doFiles() skips its argv[0], so start one entry before the
           first operand rather than shuffling argv in place. */
        doFiles( argc - optind + 1, argv + optind - 1 );
    }
    return 0;
}