bring scan_cescape up to speed

2017-07-30 12:31:12 +00:00 · 2017-07-30 12:31:12 +00:00 · ac2df2bf20
commit ac2df2bf20
parent 8526ae3d0d
4 changed files with 138 additions and 7 deletions
--- a/textcode/scan_cescape.3
+++ b/textcode/scan_cescape.3
@ -0,0 +1,35 @@
 .TH scan_cescape 3
 .SH NAME
 scan_cescape \- parse C escaped string
 .SH SYNTAX
 .B #include <libowfat/textcode.h>
 size_t \fBscan_cescape\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
 .SH DESCRIPTION
 scan_cescape parses C escaped text from src into dest.
 Parsing stops at the 0 terminator, invalid input characters or a double
 quote that was not escaped.
 C escape sequences like \\n and \\x0a and \\012 are translated into
 their binary counterparts. The C99 escape sequences \\u and \\U are
 supported and lead to UTF-8 sequences being output.
 scan_cescape will then write the number of bytes in dest into *destlen,
 and return the number of bytes decoded from src.
 dest can be NULL. destlen can be NULL.
 To make sure dest is large enough, either allocate strlen(src)+1 bytes
 or call scan_cescape twice, the first time with dest == NULL (*destlen
 will still be written).
 .SH "RETURN VALUE"
 scan_cescape returns the number of bytes successfully parsed
 from src.
 .SH EXAMPLES
 scan_cescape("test\e\en\e");",buf,&i) -> return 6, i=5, buf="test\en".
 .SH "SEE ALSO"
 fmt_jsonescape(3), fmt_cescape(3), scan_ldapescape(3)
--- a/textcode/scan_cescape.c
+++ b/textcode/scan_cescape.c
@ -18,25 +18,84 @@ size_t scan_cescape(const char *src,char *dest,size_t *destlen) {
      case 't': c='\t'; break;
      case 'v': c='\v';
      case '\\': break;
-      case 'x':
+      case 'x':	// hex escape; \x0a -> 10
 	{
 	  unsigned char a,b;
 	  a=scan_fromhex(s[i+2]);
 	  b=scan_fromhex(s[i+3]);
-	  if (a<16 && b<16) {
+	  if (a<16) {
 	    if (b<16) {
 	      c=(a<<4)+b;
 	      i+=2;
 	    } else {
 	      c=a;
 	      i+=1;
 	    }
 	  }
 	}
 	break;
      case 'u': // C99 unicode escape: \u000a -> 10
      case 'U': // C99 unicode escape: \U0000000a -> 10
 	{
 	  unsigned int j,k=0,l=(s[i+1]=='U'?10:6);
 	  for (j=2; j<l; ++j) {
 	    unsigned char c=scan_fromhex(s[i+j]);
 	    if (c>=16) // error
 	      goto error;	// don't allow short sequences
 	    k=k*16+c;
 	  }
 	  written+=fmt_utf8(dest?dest+written:0,k);
 	  i+=j-1;
 	  continue;
 	}
      default:
 	if (s[i+1]>='0' && s[i+1]<='7') {	// octal escape; \012 -> 10
 	  unsigned int j,k;
 	  for (k=0,j=1; j<4; ++j) {
 	    unsigned int l=s[i+j]-'0';
 	    if (l<8)
 	      k=k*8+l;
 	    else
 	      break;
 	  }
 	  if (dest) dest[written++]=k;
 	  i+=j-1;
 	  continue;
 	}
 	--i;
      }
      ++i;
-    }
+    } else if (c=='"')
-    dest[written]=c;
+      break;
    if (dest) dest[written]=c;
    ++written;
  }
-  *destlen=written;
+error:
  if (destlen) *destlen=written;
  return i;
 }
 #ifdef UNITTEST
 #include <assert.h>
 #undef UNITTEST
 #include <scan/scan_fromhex.c>
 #include <fmt/fmt_utf8.c>
 /* Self-test for scan_cescape.  Each case feeds an escaped string that is
  * followed by an unescaped double quote and checks three things: how many
  * source bytes were consumed, how many bytes were produced, and the decoded
  * bytes themselves. */
 int main() {
  char out[100];
  size_t produced;
  size_t consumed;

  /* single-character escape */
  consumed=scan_cescape("test\\n\");",out,&produced);
  assert(consumed==6);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  /* check hex and octal escaping */
  consumed=scan_cescape("test\\x0a\");",out,&produced);
  assert(consumed==8);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  consumed=scan_cescape("test\\012\");",out,&produced);
  assert(consumed==8);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  /* check short escape sequences: one hex digit, two and one octal digits */
  consumed=scan_cescape("test\\xa\");",out,&produced);
  assert(consumed==7);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  consumed=scan_cescape("test\\12\");",out,&produced);
  assert(consumed==7);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  consumed=scan_cescape("test\\1\");",out,&produced);
  assert(consumed==6);
  assert(produced==5);
  assert(!memcmp(out,"test\1",5));

  /* check unicode: 4-digit and 8-digit forms */
  consumed=scan_cescape("test\\u000a\");",out,&produced);
  assert(consumed==10);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  consumed=scan_cescape("test\\U0000000a\");",out,&produced);
  assert(consumed==14);
  assert(produced==5);
  assert(!memcmp(out,"test\n",5));

  /* check that short sequences are rejected: parsing stops before the
   * backslash, so only "test" is consumed and produced */
  consumed=scan_cescape("test\\Ua\");",out,&produced);
  assert(consumed==4);
  assert(produced==4);
  assert(!memcmp(out,"test",4));
 }
 #endif
--- a/textcode/scan_jsonescape.3
+++ b/textcode/scan_jsonescape.3
@ -0,0 +1,31 @@
 .TH scan_jsonescape 3
 .SH NAME
 scan_jsonescape \- parse JSON escaped string
 .SH SYNTAX
 .B #include <libowfat/textcode.h>
 size_t \fBscan_jsonescape\fP(const char *\fIsrc\fR,char *\fIdest\fR,size_t* \fIdestlen\fR);
 .SH DESCRIPTION
 scan_jsonescape parses JSON escaped text from src into dest, leaving a
 UTF-8 string in dest. Parsing stops at the 0 terminator, invalid input
 characters or a double quote that was not escaped.
 It will then write the number of bytes in dest into *destlen,
 and return the number of bytes decoded from src.
 dest can be NULL. destlen can be NULL.
 To make sure dest is large enough, either allocate strlen(src)+1 bytes
 or call scan_jsonescape twice, the first time with dest == NULL (*destlen
 will still be written).
 .SH "RETURN VALUE"
 scan_jsonescape returns the number of bytes successfully parsed
 from src.
 .SH EXAMPLES
 scan_jsonescape("test\e\en\e");",buf,&i) -> return 6, i=5, buf="test\en".
 .SH "SEE ALSO"
 fmt_jsonescape(3), scan_cescape(3), scan_ldapescape(3)
--- a/textcode/scan_jsonescape.c
+++ b/textcode/scan_jsonescape.c
@ -73,6 +73,10 @@ abort:
 #ifdef UNITTEST
 #include <assert.h>
 #include <string.h>
 #undef UNITTEST
 #include <scan/scan_fromhex.c>
 #include <scan/scan_utf8.c>
 #include <fmt/fmt_utf8.c>
 int main() {
  char buf[100];
@ -105,3 +109,5 @@ int main() {
  return 0;
 }
 #endif