Automatically downloads the NHK English news podcast. The "spxml" XML parser is used. This application requires the mpod motherboard. See also http://mbed.org/users/geodenx/notebook/mpod/
Dependencies: BlinkLed HTTPClient EthernetInterface FatFileSystemCpp MSCFileSystem spxml mbed-rtos mbed
Fork of mpod_nhk_english by
Automatically downloads the NHK English news podcast.
The "spxml" XML parser is used.
This application requires the mpod motherboard.
See also http://mbed.org/users/geodenx/notebook/mpod/
spxml/spxmlcodec.cpp@4:7dae52cf560f, 2012-08-20 (annotated)
- Committer:
- togayan
- Date:
- Mon Aug 20 13:27:17 2012 +0000
- Revision:
- 4:7dae52cf560f
1st revision
Who changed what in which revision?
User | Revision | Line number | New contents of line |
---|---|---|---|
togayan | 4:7dae52cf560f | 1 | /* |
togayan | 4:7dae52cf560f | 2 | * Copyright 2007 Stephen Liu |
togayan | 4:7dae52cf560f | 3 | * For license terms, see the file COPYING along with this library. |
togayan | 4:7dae52cf560f | 4 | */ |
togayan | 4:7dae52cf560f | 5 | |
togayan | 4:7dae52cf560f | 6 | #include <string.h> |
togayan | 4:7dae52cf560f | 7 | #include <stdlib.h> |
togayan | 4:7dae52cf560f | 8 | #include <ctype.h> |
togayan | 4:7dae52cf560f | 9 | |
togayan | 4:7dae52cf560f | 10 | #include "spxmlcodec.hpp" |
togayan | 4:7dae52cf560f | 11 | #include "spxmlutils.hpp" |
togayan | 4:7dae52cf560f | 12 | |
togayan | 4:7dae52cf560f | 13 | const char * SP_XmlStringCodec :: DEFAULT_ENCODING = "utf-8"; |
togayan | 4:7dae52cf560f | 14 | |
togayan | 4:7dae52cf560f | 15 | const char SP_XmlStringCodec :: XML_CHARS [] = |
togayan | 4:7dae52cf560f | 16 | { '<', '>', '&', '\'', '"' }; |
togayan | 4:7dae52cf560f | 17 | const char * SP_XmlStringCodec :: ESC_CHARS [] = |
togayan | 4:7dae52cf560f | 18 | { "<", ">", "&", "'", """ }; |
togayan | 4:7dae52cf560f | 19 | |
togayan | 4:7dae52cf560f | 20 | int SP_XmlStringCodec :: decode( const char * encoding, const char * encodeValue, |
togayan | 4:7dae52cf560f | 21 | SP_XmlStringBuffer * outBuffer ) |
togayan | 4:7dae52cf560f | 22 | { |
togayan | 4:7dae52cf560f | 23 | int isUtf8 = ( 0 == strcasecmp( encoding, "utf-8" ) ); |
togayan | 4:7dae52cf560f | 24 | |
togayan | 4:7dae52cf560f | 25 | const char * pos = encodeValue; |
togayan | 4:7dae52cf560f | 26 | for( ; '\0' != *pos; ) { |
togayan | 4:7dae52cf560f | 27 | if( '&' == *pos ) { |
togayan | 4:7dae52cf560f | 28 | int index = -1; |
togayan | 4:7dae52cf560f | 29 | int len = 0; |
togayan | 4:7dae52cf560f | 30 | for( int i = 0; i < (int)( sizeof( ESC_CHARS ) / sizeof( ESC_CHARS[0] ) ); i++ ) { |
togayan | 4:7dae52cf560f | 31 | len = strlen( ESC_CHARS[ i ] ); |
togayan | 4:7dae52cf560f | 32 | if( 0 == strncmp( pos, ESC_CHARS[i], len ) ) { |
togayan | 4:7dae52cf560f | 33 | index = i; |
togayan | 4:7dae52cf560f | 34 | break; |
togayan | 4:7dae52cf560f | 35 | } |
togayan | 4:7dae52cf560f | 36 | } |
togayan | 4:7dae52cf560f | 37 | if( index >= 0 ) { |
togayan | 4:7dae52cf560f | 38 | outBuffer->append( XML_CHARS[ index ] ); |
togayan | 4:7dae52cf560f | 39 | pos += len; |
togayan | 4:7dae52cf560f | 40 | } else { |
togayan | 4:7dae52cf560f | 41 | char * next = ""; |
togayan | 4:7dae52cf560f | 42 | int ch = 0; |
togayan | 4:7dae52cf560f | 43 | if( '#' == *( pos + 1 ) ) { |
togayan | 4:7dae52cf560f | 44 | if( 'x' == *( pos + 2 ) ) { |
togayan | 4:7dae52cf560f | 45 | ch = strtol( pos + 3, &next, 16 ); |
togayan | 4:7dae52cf560f | 46 | } else { |
togayan | 4:7dae52cf560f | 47 | ch = strtol( pos + 2, &next, 10 ); |
togayan | 4:7dae52cf560f | 48 | } |
togayan | 4:7dae52cf560f | 49 | } |
togayan | 4:7dae52cf560f | 50 | |
togayan | 4:7dae52cf560f | 51 | // TODO: fully support xml entity, currently only support unicode entity |
togayan | 4:7dae52cf560f | 52 | if( ';' == *next && 0 != ch ) { |
togayan | 4:7dae52cf560f | 53 | if( isUtf8 ) { |
togayan | 4:7dae52cf560f | 54 | SP_XmlUtf8Codec::uni2utf8( ch, outBuffer ); |
togayan | 4:7dae52cf560f | 55 | } else { |
togayan | 4:7dae52cf560f | 56 | outBuffer->append( ch ); |
togayan | 4:7dae52cf560f | 57 | } |
togayan | 4:7dae52cf560f | 58 | pos = next + 1; |
togayan | 4:7dae52cf560f | 59 | } else { |
togayan | 4:7dae52cf560f | 60 | outBuffer->append( *pos++ ); |
togayan | 4:7dae52cf560f | 61 | } |
togayan | 4:7dae52cf560f | 62 | } |
togayan | 4:7dae52cf560f | 63 | } else { |
togayan | 4:7dae52cf560f | 64 | outBuffer->append( *pos++ ); |
togayan | 4:7dae52cf560f | 65 | } |
togayan | 4:7dae52cf560f | 66 | } |
togayan | 4:7dae52cf560f | 67 | |
togayan | 4:7dae52cf560f | 68 | return 0; |
togayan | 4:7dae52cf560f | 69 | } |
togayan | 4:7dae52cf560f | 70 | |
togayan | 4:7dae52cf560f | 71 | int SP_XmlStringCodec :: encode( const char * encoding, const char * decodeValue, |
togayan | 4:7dae52cf560f | 72 | SP_XmlStringBuffer * outBuffer ) |
togayan | 4:7dae52cf560f | 73 | { |
togayan | 4:7dae52cf560f | 74 | int isUtf8 = ( 0 == strcasecmp( encoding, "utf-8" ) ); |
togayan | 4:7dae52cf560f | 75 | |
togayan | 4:7dae52cf560f | 76 | const unsigned char * pos = (unsigned char *)decodeValue; |
togayan | 4:7dae52cf560f | 77 | for( ; '\0' != *pos; pos++ ) { |
togayan | 4:7dae52cf560f | 78 | int index = -1; |
togayan | 4:7dae52cf560f | 79 | for( int i = 0; i < (int)( sizeof( XML_CHARS ) / sizeof( XML_CHARS[0] ) ); i++ ) { |
togayan | 4:7dae52cf560f | 80 | if( XML_CHARS[i] == *pos ) { |
togayan | 4:7dae52cf560f | 81 | index = i; |
togayan | 4:7dae52cf560f | 82 | break; |
togayan | 4:7dae52cf560f | 83 | } |
togayan | 4:7dae52cf560f | 84 | } |
togayan | 4:7dae52cf560f | 85 | if( index >= 0 && '\'' != *pos ) { |
togayan | 4:7dae52cf560f | 86 | outBuffer->append( ESC_CHARS[ index ] ); |
togayan | 4:7dae52cf560f | 87 | } else { |
togayan | 4:7dae52cf560f | 88 | if( isUtf8 ) { |
togayan | 4:7dae52cf560f | 89 | int ch = 0; |
togayan | 4:7dae52cf560f | 90 | int len = SP_XmlUtf8Codec::utf82uni( (unsigned char*)pos, &ch ); |
togayan | 4:7dae52cf560f | 91 | |
togayan | 4:7dae52cf560f | 92 | if( len > 0 ) { |
togayan | 4:7dae52cf560f | 93 | pos += len - 1; |
togayan | 4:7dae52cf560f | 94 | |
togayan | 4:7dae52cf560f | 95 | char temp[ 32 ] = { 0 }; |
togayan | 4:7dae52cf560f | 96 | snprintf( temp, sizeof( temp ), "&#%d;", ch ); |
togayan | 4:7dae52cf560f | 97 | outBuffer->append( temp ); |
togayan | 4:7dae52cf560f | 98 | } else { |
togayan | 4:7dae52cf560f | 99 | outBuffer->append( *pos ); |
togayan | 4:7dae52cf560f | 100 | } |
togayan | 4:7dae52cf560f | 101 | } else { |
togayan | 4:7dae52cf560f | 102 | if( *pos < 32 ) { |
togayan | 4:7dae52cf560f | 103 | char temp[ 32 ] = { 0 }; |
togayan | 4:7dae52cf560f | 104 | snprintf( temp, sizeof( temp ), "&#%d;", *pos ); |
togayan | 4:7dae52cf560f | 105 | outBuffer->append( temp ); |
togayan | 4:7dae52cf560f | 106 | } else { |
togayan | 4:7dae52cf560f | 107 | outBuffer->append( *pos ); |
togayan | 4:7dae52cf560f | 108 | } |
togayan | 4:7dae52cf560f | 109 | } |
togayan | 4:7dae52cf560f | 110 | } |
togayan | 4:7dae52cf560f | 111 | } |
togayan | 4:7dae52cf560f | 112 | |
togayan | 4:7dae52cf560f | 113 | return 0; |
togayan | 4:7dae52cf560f | 114 | } |
togayan | 4:7dae52cf560f | 115 | |
togayan | 4:7dae52cf560f | 116 | int SP_XmlStringCodec :: isNameChar( const char * encoding, char c ) |
togayan | 4:7dae52cf560f | 117 | { |
togayan | 4:7dae52cf560f | 118 | if( 0 == strcasecmp( encoding, "utf-8" ) ) { |
togayan | 4:7dae52cf560f | 119 | return 1; |
togayan | 4:7dae52cf560f | 120 | } else { |
togayan | 4:7dae52cf560f | 121 | return isalnum(c) || c == ':' || c == '-' || c == '.' || c == '_'; |
togayan | 4:7dae52cf560f | 122 | } |
togayan | 4:7dae52cf560f | 123 | } |
togayan | 4:7dae52cf560f | 124 | |
togayan | 4:7dae52cf560f | 125 | //========================================================= |
togayan | 4:7dae52cf560f | 126 | |
togayan | 4:7dae52cf560f | 127 | int SP_XmlUtf8Codec :: utf82uni( const unsigned char * utf8, int * ch ) |
togayan | 4:7dae52cf560f | 128 | { |
togayan | 4:7dae52cf560f | 129 | int len = 0; |
togayan | 4:7dae52cf560f | 130 | |
togayan | 4:7dae52cf560f | 131 | unsigned char c1 = 0, c2 = 0, c3 = 0, c4 = 0; |
togayan | 4:7dae52cf560f | 132 | |
togayan | 4:7dae52cf560f | 133 | if( *utf8 >= 0x80 ) { |
togayan | 4:7dae52cf560f | 134 | c1 = *utf8++; |
togayan | 4:7dae52cf560f | 135 | |
togayan | 4:7dae52cf560f | 136 | if( c1 < 0xE0 ) { // 2 bytes |
togayan | 4:7dae52cf560f | 137 | if( '\0' != ( c2 = *utf8 ) ) { |
togayan | 4:7dae52cf560f | 138 | *ch = ((c1 & 0x1F) << 6) | (c2 & 0x3F); |
togayan | 4:7dae52cf560f | 139 | len = 2; |
togayan | 4:7dae52cf560f | 140 | } |
togayan | 4:7dae52cf560f | 141 | } else if( c1 < 0xF0 ) { // 3 bytes |
togayan | 4:7dae52cf560f | 142 | if( '\0' != ( c2 = *utf8++ ) && '\0' != ( c3 = *utf8 ) ) { |
togayan | 4:7dae52cf560f | 143 | *ch = ((c1 & 0x0F) << 12) | ((c2 & 0x3F) << 6)| (c3 & 0x3F); |
togayan | 4:7dae52cf560f | 144 | len = 3; |
togayan | 4:7dae52cf560f | 145 | } |
togayan | 4:7dae52cf560f | 146 | } else { // 4 bytes |
togayan | 4:7dae52cf560f | 147 | if( '\0' != ( c2 = *utf8++ ) && '\0' != ( c3 = *utf8++ ) |
togayan | 4:7dae52cf560f | 148 | && '\0' != ( c4 = *utf8 ) ) { |
togayan | 4:7dae52cf560f | 149 | *ch = ((c1 & 0x07) << 16) | ((c2 & 0x3F) << 12) |
togayan | 4:7dae52cf560f | 150 | | ((c3 & 0x3F) << 6) | (c4 & 0x3F); |
togayan | 4:7dae52cf560f | 151 | len = 4; |
togayan | 4:7dae52cf560f | 152 | } |
togayan | 4:7dae52cf560f | 153 | } |
togayan | 4:7dae52cf560f | 154 | } |
togayan | 4:7dae52cf560f | 155 | |
togayan | 4:7dae52cf560f | 156 | return len; |
togayan | 4:7dae52cf560f | 157 | } |
togayan | 4:7dae52cf560f | 158 | |
togayan | 4:7dae52cf560f | 159 | void SP_XmlUtf8Codec :: uni2utf8( int ch, SP_XmlStringBuffer * outBuffer ) |
togayan | 4:7dae52cf560f | 160 | { |
togayan | 4:7dae52cf560f | 161 | if( ch < 0x80 ) outBuffer->append( ch ); |
togayan | 4:7dae52cf560f | 162 | else if( ch < 0x800 ) { |
togayan | 4:7dae52cf560f | 163 | outBuffer->append( 0xC0 | ( ch >> 6 ) ); |
togayan | 4:7dae52cf560f | 164 | outBuffer->append( 0x80 | ( ch & 0x3F ) ); |
togayan | 4:7dae52cf560f | 165 | } else if( ch < 0x10000 ) { |
togayan | 4:7dae52cf560f | 166 | outBuffer->append( 0xE0 | ( ch >> 12 ) ); |
togayan | 4:7dae52cf560f | 167 | outBuffer->append( 0x80 | ( ( ch >> 6 ) & 0x3F ) ); |
togayan | 4:7dae52cf560f | 168 | outBuffer->append( 0x80 | ( ch & 0x3F ) ); |
togayan | 4:7dae52cf560f | 169 | } else if( ch < 0x200000 ) { |
togayan | 4:7dae52cf560f | 170 | outBuffer->append( 0xF0 | ( ch >> 18 ) ); |
togayan | 4:7dae52cf560f | 171 | outBuffer->append( 0x80 | ( ( ch >> 12 ) & 0x3F ) ); |
togayan | 4:7dae52cf560f | 172 | outBuffer->append( 0x80 | ( ( ch >> 6 ) & 0x3F ) ); |
togayan | 4:7dae52cf560f | 173 | outBuffer->append( 0x80 | ( ch & 0x3F ) ); |
togayan | 4:7dae52cf560f | 174 | } |
togayan | 4:7dae52cf560f | 175 | } |