/**
 * Convert a file of UTF-8 to HTML.
 *
 * ASCII is copied through unchanged.  Every other code point is replaced by
 * an HTML character reference taken from the table below; code points that
 * are not in the table are reported on stderr and replaced by a default
 * string (see the -d option).
 */

/* Ask for the POSIX declarations of getopt(), optarg and optind even when
   the compiler is run in a strict ISO C mode. */
#ifndef _POSIX_C_SOURCE
#define _POSIX_C_SOURCE 200809L
#endif

#include <assert.h>
#include <errno.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Returned by getUnicodePoint() for malformed input (U+FFFD). */
enum { REPLACEMENT_CHARACTER = 0xFFFD };

/* How many more unmappable characters may be seen before giving up.
   A negative value means "no limit". */
static int tolerance = 100;

/* Text written in place of an unmappable character.  "%d" in it is replaced
   by the decimal code point and "%%" by a single percent sign. */
static const char *defaultValue = "?";

/* One row of the code point -> character reference table. */
struct entity {
    int codePoint;
    const char *text;
};

/*
 * Known character references.  MUST stay sorted by codePoint: findEntity()
 * uses bsearch().  Names are HTML 4 entity names; Z/z with caron have no
 * HTML 4 name, so they use numeric references.
 */
static const struct entity entities[] = {
    { 160, "&nbsp;" },   { 161, "&iexcl;" },  { 162, "&cent;" },
    { 163, "&pound;" },  { 164, "&curren;" }, { 165, "&yen;" },
    { 166, "&brvbar;" }, { 167, "&sect;" },   { 168, "&uml;" },
    { 169, "&copy;" },   { 170, "&ordf;" },   { 171, "&laquo;" },
    { 172, "&not;" },    { 173, "&shy;" },    { 174, "&reg;" },
    { 175, "&macr;" },   { 176, "&deg;" },    { 177, "&plusmn;" },
    { 178, "&sup2;" },   { 179, "&sup3;" },   { 180, "&acute;" },
    { 181, "&micro;" },  { 182, "&para;" },   { 183, "&middot;" },
    { 184, "&cedil;" },  { 185, "&sup1;" },   { 186, "&ordm;" },
    { 187, "&raquo;" },  { 188, "&frac14;" }, { 189, "&frac12;" },
    { 190, "&frac34;" }, { 191, "&iquest;" }, { 192, "&Agrave;" },
    { 193, "&Aacute;" }, { 194, "&Acirc;" },  { 195, "&Atilde;" },
    { 196, "&Auml;" },   { 197, "&Aring;" },  { 198, "&AElig;" },
    { 199, "&Ccedil;" }, { 200, "&Egrave;" }, { 201, "&Eacute;" },
    { 202, "&Ecirc;" },  { 203, "&Euml;" },   { 204, "&Igrave;" },
    { 205, "&Iacute;" }, { 206, "&Icirc;" },  { 207, "&Iuml;" },
    { 208, "&ETH;" },    { 209, "&Ntilde;" }, { 210, "&Ograve;" },
    { 211, "&Oacute;" }, { 212, "&Ocirc;" },  { 213, "&Otilde;" },
    { 214, "&Ouml;" },   { 215, "&times;" },  { 216, "&Oslash;" },
    { 217, "&Ugrave;" }, { 218, "&Uacute;" }, { 219, "&Ucirc;" },
    { 220, "&Uuml;" },   { 221, "&Yacute;" }, { 222, "&THORN;" },
    { 223, "&szlig;" },  { 224, "&agrave;" }, { 225, "&aacute;" },
    { 226, "&acirc;" },  { 227, "&atilde;" }, { 228, "&auml;" },
    { 229, "&aring;" },  { 230, "&aelig;" },  { 231, "&ccedil;" },
    { 232, "&egrave;" }, { 233, "&eacute;" }, { 234, "&ecirc;" },
    { 235, "&euml;" },   { 236, "&igrave;" }, { 237, "&iacute;" },
    { 238, "&icirc;" },  { 239, "&iuml;" },   { 240, "&eth;" },
    { 241, "&ntilde;" }, { 242, "&ograve;" }, { 243, "&oacute;" },
    { 244, "&ocirc;" },  { 245, "&otilde;" }, { 246, "&ouml;" },
    { 247, "&divide;" }, { 248, "&oslash;" }, { 249, "&ugrave;" },
    { 250, "&uacute;" }, { 251, "&ucirc;" },  { 252, "&uuml;" },
    { 253, "&yacute;" }, { 254, "&thorn;" },  { 255, "&yuml;" },
    { 338, "&OElig;" },  { 339, "&oelig;" },  { 352, "&Scaron;" },
    { 353, "&scaron;" }, { 376, "&Yuml;" },   { 381, "&#381;" },
    { 382, "&#382;" },   { 402, "&fnof;" },   { 710, "&circ;" },
    { 732, "&tilde;" },  { 937, "&Omega;" },  { 960, "&pi;" },
    { 8211, "&ndash;" }, { 8212, "&mdash;" }, { 8216, "&lsquo;" },
    { 8217, "&rsquo;" }, { 8218, "&sbquo;" }, { 8220, "&ldquo;" },
    { 8221, "&rdquo;" }, { 8222, "&bdquo;" }, { 8224, "&dagger;" },
    { 8225, "&Dagger;" }, { 8226, "&bull;" }, { 8230, "&hellip;" },
    { 8240, "&permil;" }, { 8249, "&lsaquo;" }, { 8250, "&rsaquo;" },
    { 8260, "&frasl;" }, { 8364, "&euro;" },  { 8482, "&trade;" },
    { 8706, "&part;" },  { 8719, "&prod;" },  { 8721, "&sum;" },
    { 8730, "&radic;" }, { 8734, "&infin;" }, { 8747, "&int;" },
    { 8776, "&asymp;" }, { 8800, "&ne;" },    { 8804, "&le;" },
    { 8805, "&ge;" },    { 9674, "&loz;" }
};

/* bsearch() comparator: key is an int code point, member a table row. */
static int compareEntity( const void *key, const void *member )
{
    int wanted = *(const int *) key;
    int have = ( (const struct entity *) member )->codePoint;

    return ( wanted > have ) - ( wanted < have );
}

/* Return the character reference for code point c, or NULL if none. */
static const char *findEntity( int c )
{
    const struct entity *hit;

    hit = bsearch( &c, entities, sizeof entities / sizeof entities[0],
                   sizeof entities[0], compareEntity );
    return ( NULL == hit ) ? NULL : hit->text;
}

/*
 * Write defaultValue to stdout, expanding "%d" to the code point c and "%%"
 * to "%".  The string comes from the command line, so it is never handed to
 * printf() as a format.
 */
static void putDefault( int c )
{
    const char *p;

    for ( p = defaultValue; '\0' != *p; p++ ) {
        if ( '%' == p[0] && 'd' == p[1] ) {
            printf( "%d", c );
            p++;
        } else if ( '%' == p[0] && '%' == p[1] ) {
            putchar( '%' );
            p++;
        } else {
            putchar( *p );
        }
    }
}

/*
 * Parse text as a decimal int.  Returns 1 and stores the value in *result
 * on success, 0 if text is not a number or does not fit in an int.
 */
static int parseInt( const char *text, int *result )
{
    char *end;
    long value;

    errno = 0;
    value = strtol( text, &end, 10 );
    if ( end == text || '\0' != *end || ERANGE == errno
         || value < INT_MIN || value > INT_MAX ) {
        return 0;
    }
    *result = (int) value;
    return 1;
}

/* Print the usage message to stderr and exit with status 1. */
static void usage( const char *progName )
{
    fprintf( stderr, "usage: %s [-t n] [-d s] [files]\n", progName );
    fprintf( stderr, " where n is how many errors to tolerate\n" );
    fprintf( stderr, " and s is a default string" );
    fprintf( stderr, " for unmappable chars.\n" );
    fprintf( stderr, " s can contain something like %%d" );
    fprintf( stderr, " or be empty.\n" );
    exit( 1 );
}

/**
 * Read one UTF-8 encoded code point from infp.
 *
 * @param infp  stream to read from (should be opened in binary mode)
 * @return the code point, EOF at end of input, or REPLACEMENT_CHARACTER
 *         (U+FFFD) when the bytes read do not form a 1- to 4-byte UTF-8
 *         sequence.  A byte that should have been a continuation byte but
 *         is not is pushed back so that it starts the next code point.
 */
int getUnicodePoint( FILE *infp )
{
    int lead;
    int extra;
    int answer;

    lead = fgetc( infp );
    if ( EOF == lead || lead < 0x80 ) {
        return lead;
    }

    if ( 0xc0 == ( lead & 0xe0 ) ) {
        extra = 1;
        answer = lead & 0x1f;
    } else if ( 0xe0 == ( lead & 0xf0 ) ) {
        extra = 2;
        answer = lead & 0x0f;
    } else if ( 0xf0 == ( lead & 0xf8 ) ) {
        extra = 3;
        answer = lead & 0x07;
    } else {
        /* Stray continuation byte (10xxxxxx) or a lead byte announcing
           more than four bytes. */
        return REPLACEMENT_CHARACTER;
    }

    while ( extra > 0 ) {
        int c = fgetc( infp );

        if ( EOF == c ) {
            /* Truncated sequence; the next call reports EOF. */
            return REPLACEMENT_CHARACTER;
        }
        if ( 0x80 != ( c & 0xc0 ) ) {
            ungetc( c, infp );
            return REPLACEMENT_CHARACTER;
        }
        answer = ( answer << 6 ) | ( c & 0x3f );
        extra--;
    }

    /* A decoded code point must never be mistaken for end of input. */
    assert( answer >= 0 );
    return answer;
}

/**
 * Handle a code point that has no character reference: report it on stderr,
 * quit with status 1 once the tolerance is used up, otherwise write the
 * default replacement text to stdout.
 *
 * @param c  the unmappable code point
 */
void replace( int c )
{
    fprintf( stderr,
             "Unicode code point %d has no equivalent HTML entity.\n", c );
    if ( 0 == tolerance ) {
        fprintf( stderr, "Too many unmappable characters. Quitting.\n" );
        exit( 1 );
    }
    /* Negative means unlimited; do not count down towards INT_MIN. */
    if ( tolerance > 0 ) {
        tolerance--;
    }
    putDefault( c );
}

/**
 * Write code point c to stdout: ASCII as is, anything else as its HTML
 * character reference, or via replace() when there is none.
 *
 * @param c  code point as returned by getUnicodePoint() (not EOF)
 */
void putUnicodePoint( int c )
{
    const char *text;

    if ( c < 128 ) {
        putchar( c );
        return;
    }

    text = findEntity( c );
    if ( NULL == text ) {
        replace( c );
    } else {
        fputs( text, stdout );
    }
}

/**
 * Convert every file named in argv[1] .. argv[argc - 1] to stdout.
 * argv[0] is skipped; the name "-" means standard input.  A file that
 * cannot be opened is reported on stderr and skipped.
 *
 * @param argc  number of entries in argv, including the skipped argv[0]
 * @param argv  file names
 */
void doFiles( int argc, char **argv )
{
    FILE *infp;
    char *infile;
    int c;

    while ( --argc > 0 ) {
        infile = *++argv;
        if ( 0 == strcmp( infile, "-" ) ) {
            infp = stdin;
        }
        /* Must open input in binary mode so we can catch chars with
           high bit set. */
        else if ( ( infp = fopen( infile, "rb" ) ) == NULL ) {
            fprintf( stderr, "Unable to open input file \"%s\".\n", infile );
            continue;
        }

        while ( EOF != ( c = getUnicodePoint( infp ) ) ) {
            putUnicodePoint( c );
        }

        if ( stdin != infp ) {
            fclose( infp );
        }
    }
}

/**
 * Parse the options, then convert the named files, or standard input when
 * no file is named.
 *
 *   -t n   tolerate n unmappable characters before quitting (default 100)
 *   -d s   text written for an unmappable character (default "?")
 */
int main( int argc, char **argv )
{
    int c;
    char *oneName[2];

    while ( -1 != ( c = getopt( argc, argv, "t:d:" ) ) ) {
        switch ( c ) {
        case 't':
            fprintf( stderr, "Tolerance given as %s.\n", optarg );
            if ( !parseInt( optarg, &tolerance ) ) {
                usage( argv[0] );
            }
            break;
        case 'd':
            defaultValue = optarg;
            break;
        default:
            usage( argv[0] );
        }
    }

    if ( optind >= argc ) {
        oneName[0] = argv[0];
        oneName[1] = "-";
        doFiles( 2, oneName );
    } else {
        /* doFiles() skips element 0, so hand it the slot just before the
           first file name. */
        doFiles( argc - optind + 1, argv + optind - 1 );
    }

    if ( EOF == fflush( stdout ) ) {
        perror( "stdout" );
        return 1;
    }
    return 0;
}