diff --git a/textcode.h b/textcode.h index 71829fe..b10cf3a 100644 --- a/textcode.h +++ b/textcode.h @@ -49,7 +49,7 @@ size_t fmt_foldwhitespace(char* dest,const char* src,size_t len); size_t fmt_ldapescape(char* dest,const char* src,size_t len); size_t fmt_ldapescape2(char* dest,const char* src,size_t len,const char* escapeme); /* Encode JSON string from UTF-8; will backslash-escape the bare minimum. - * Will not verify that the input is valid UTF-8! + * Invalid UTF-8 input is not rejected: each offending byte is re-encoded as the UTF-8 sequence for its byte value (e.g. 0x81 becomes 0xC2 0x81). * Worst case: len*6 */ size_t fmt_jsonescape(char* dest,const char* src,size_t len); diff --git a/textcode/fmt_jsonescape.c b/textcode/fmt_jsonescape.c index eb97b10..b86a471 100644 --- a/textcode/fmt_jsonescape.c +++ b/textcode/fmt_jsonescape.c @@ -42,8 +42,13 @@ escape: /* UTF-8! Convert to surrogate pair if needed. */ uint32_t u; size_t j=scan_utf8_sem((const char*)s+i,len-i,&u); - if (j==0) /* Invalid UTF-8! Abort! */ - return written; + if (j==0) { /* Invalid UTF-8! Try to limp on! */ + written+=fmt_utf8(dest?dest+written:0,s[i]); + break; + } + /* It turns out we are not required to escape these. + * So we won't. */ +#if 0 if (u>0xffff) { if (dest) { dest[written ]='\\'; @@ -54,7 +59,9 @@ escape: fmt_xlong(dest+written+8,0xdc00 + (u & 0x3ff)); } written+=12; - } else { + } else +#endif + { if (dest) memcpy(dest+written,s+i,j); written+=j; } @@ -83,6 +90,9 @@ int main() { /* test escaping of unprintable characters */ assert(fmt_jsonescape(buf,"\001x",2)==7 && !memcmp(buf,"\\u0001x",7)); /* test conversion of large UTF-8 chars to UTF-16 surrogate pairs (poop emoji) */ - assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13)); + /* EDIT: this escaping is not actually needed, so we aren't doing it + * anymore. This test will fail now: + assert(fmt_jsonescape(buf,"\xf0\x9f\x92\xa9x",5)==13 && !memcmp(buf,"\\ud83d\\udca9x",13)); */ + assert(fmt_jsonescape(buf,"a\x81x",3)==4 && !memcmp(buf,"a\xc2\x81x",4)); } #endif