+/*
+  scanstring - like scanfile() but for a NUL-terminated string.
+
+  Classifies s from its bytes alone: a leading Unicode byte-order mark,
+  bytes with the 8th bit set, C0/C1 control characters, and (when UNICODE
+  is defined) whether the 8-bit bytes follow the UTF-8 lead/continuation
+  byte pattern.
+
+  Call with:
+    s = string to scan; a NULL pointer is treated as an empty string.
+  Returns:
+    -1 if the string is empty, otherwise FT_7BIT, FT_8BIT, FT_UTF8, FT_BIN,
+    or (only when s starts with a UCS-2 BOM) FT_UCS2.
+
+  The secondary value "val" (byte order for UCS-2, C1-controls flag for
+  8-bit text) is computed but there is no way to pass it back to the caller.
+
+  This is a cut-down adaptation of scanfile.  Some of its NUL-byte logic is
+  still present, but because scanning stops at strlen(s) the NUL counters
+  (runmax, oddzero, evenzero) are never incremented in this function.
+*/
+int
+scanstring(s) char * s; {
+    int val = -1, count = 0;            /* Workers */
+    int rc = -1;                        /* Return code */
+    int bytes = 0;                      /* Total byte count */
+#ifdef UNICODE
+    unsigned int c0, c1;                /* First 2 string bytes (for BOM) */
+#endif /* UNICODE */
+
+    register int i;                     /* Loop control */
+    int eightbit = 0;                   /* Number of bytes with 8th bit on */
+    int c0controls = 0;                 /* C0 non-text control-char counter */
+    int c0noniso = 0;                   /* C0 non-ISO control-char counter */
+    int c1controls = 0;                 /* C1 control-character counter */
+    unsigned int c;                     /* Current character */
+    int runmax = 0;                     /* Longest run of 0 bytes */
+    int txtcz = 0;                      /* Never set in this function */
+
+#ifdef UNICODE
+    int notutf8 = 0;                    /* Nonzero if definitely not UTF-8 */
+    int utf8state = 0;                  /* UTF-8 recognizer state */
+    int oddzero = 0;                    /* Number of 0 bytes in odd positions */
+    int evenzero = 0;                   /* and in even positions */
+#else
+    int notutf8 = 1;
+#endif /* UNICODE */
+
+    char * buf;                         /* The bytes being scanned */
+
+    if (!s) s = "";                     /* Substitute before aliasing s */
+    buf = s;
+    count = (int)strlen(s);
+
+#ifdef UNICODE
+    if (count > 1) {                    /* First look for BOM */
+        int incl_cnt = 0;               /* Nonzero once a BOM is recognized */
+
+        c0 = (unsigned)((unsigned)buf[0]&0xFF); /* First byte */
+        c1 = (unsigned)((unsigned)buf[1]&0xFF); /* Second byte */
+
+        if (c0 == 0xFE && c1 == 0xFF) { /* UCS-2 BE */
+            rc = FT_UCS2;
+            val = 0;
+            debug(F111,"scanstring UCS2 BOM BE",ckitoa(val),rc);
+            incl_cnt++;
+        } else if (c0 == 0xFF && c1 == 0xFE) { /* UCS-2 LE */
+            rc = FT_UCS2;
+            val = 1;
+            debug(F111,"scanstring UCS2 BOM LE",ckitoa(val),rc);
+            incl_cnt++;
+        } else if (count > 2 && c0 == 0xEF && c1 == 0xBB &&
+                   (unsigned)((unsigned)buf[2]&0xFF) == 0xBF) { /* UTF-8 */
+            rc = FT_UTF8;
+            debug(F111,"scanstring UTF8 BOM",ckitoa(val),rc);
+            incl_cnt++;
+        }
+        if (incl_cnt) {                 /* Have BOM */
+            bytes += count;
+            goto xscanstring;
+        }
+    }
+#endif /* UNICODE */
+
+    bytes += count;                     /* Count bytes scanned */
+
+    for (i = 0; i < count; i++) {       /* For each byte... */
+        /* Mask to 8 bits as the BOM test does: where plain char is signed, */
+        /* a sign-extended value could never fall in the C1 range below.    */
+        c = (unsigned)((unsigned)buf[i]&0xFF);
+        if (!c) {                       /* Zero byte? */
+            goto xscanstring;           /* Null terminated string */
+        }
+        if ((c & 0x80) == 0) {          /* We have a 7-bit byte */
+            if (c < ' ') {              /* Check for C0 controls */
+                if (c != LF && c != CR && c != HT && c != FF) {
+                    c0controls++;
+                    if (c != ESC && c != SO && c != SI) {
+                        c0noniso++;
+                    }
+                }
+                if (c == '\032') {      /* Ctrl-Z: undo both counts above */
+                    c0controls--;
+                    c0noniso--;
+                }
+            }
+#ifdef UNICODE
+            if (!notutf8 && utf8state) { /* In UTF-8 sequence? */
+                utf8state = 0;
+                debug(F000,"scanstring","7-bit byte in UTF8 sequence",c);
+                notutf8++;              /* Then it's not UTF-8 */
+                continue;
+            }
+#endif /* UNICODE */
+        } else {                        /* We have an 8-bit byte */
+            eightbit++;                 /* Count it */
+            if (c >= 0x80 && c < 0xA0) { /* Check for C1 controls */
+                c1controls++;
+            }
+#ifdef UNICODE
+            if (!notutf8) {             /* If it might still be UTF8... */
+                switch (utf8state) {    /* Enter the UTF-8 state machine */
+                  case 0:               /* First byte... */
+                    if ((c & 0xE0) == 0xC0) { /* Tells number of */
+                        utf8state = 1;  /* subsequent bytes */
+                    } else if ((c & 0xF0) == 0xE0) {
+                        utf8state = 2;
+                    } else if ((c & 0xF8) == 0xF0) {
+                        utf8state = 3;
+                    } else {
+                        notutf8++;
+                    }
+                    break;
+                  case 1:               /* Subsequent byte */
+                  case 2:
+                  case 3:
+                    if ((c & 0xC0) != 0x80) { /* Must start with 10 */
+                        debug(F000,"scanstring",
+                              "bad byte in UTF8 sequence",c);
+                        notutf8++;
+                        break;
+                    }
+                    utf8state--;        /* Good, one less in this sequence */
+                    break;
+                  default:              /* Shouldn't happen */
+                    debug(F111,"scanstring","bad UTF8 state",utf8state);
+                    notutf8++;
+                }
+            }
+#endif /* UNICODE */
+        }
+    }
+
+/* NOTE(review): a string ending inside a multibyte sequence (utf8state */
+/* still nonzero) is not rejected as UTF-8 -- confirm this is intended.  */
+    if (bytes == 0) {                   /* If nothing was scanned */
+        return(-1);                     /* we're done. */
+    }
+
+#ifdef UNICODE
+    if (runmax > 2) {                   /* More than 2 NULs in a row */
+        debug(F100,"scanstring BIN runmax","",0);
+        rc = FT_BIN;                    /* so it can't be any kind of text. */
+        goto xscanstring;
+    } else if (rc == FT_UCS2 || (rc == FT_UTF8 && runmax == 0)) {
+        goto xscanstring;               /* String starts with a BOM */
+    } else if (eightbit > 0 && !notutf8) { /* String has 8-bit data */
+        if (runmax > 0) {               /* and runs of NULs */
+            debug(F100,"scanstring BIN (nnUTF8) runmax","",0);
+            rc = FT_BIN;                /* UTF-8 doesn't have NULs */
+        } else {                        /* No NULs */
+            debug(F100,"scanstring UTF8 (nnUTF8 + runmax == 0)","",0);
+            rc = FT_UTF8;               /* and not not UTF-8, so is UTF-8 */
+        }
+        goto xscanstring;
+    }
+/*
+  rc is set to FT_UCS2 or FT_UTF8 only by the BOM code, which then jumps
+  straight to xscanstring, so neither this test nor the rc test above can
+  be true as the function is written.
+*/
+    if (rc == FT_UCS2) {
+        if (bytes < 100) {
+            if (oddzero != 0 && evenzero != 0) {
+                debug(F100,"scanstring small UCS2 doubtful","",0);
+                rc = FT_BIN;
+                goto xscanstring;
+            } else if (oddzero == 0 && evenzero == 0) {
+                rc = eightbit ? FT_8BIT : FT_7BIT;
+            }
+        }
+        goto xscanstring;               /* Seems to be UCS-2 */
+    }
+
+/* If none of the above, it's probably not Unicode. */
+
+    if (!eightbit) {                    /* It's 7-bit */
+        if (c0controls) {               /* This would be strange */
+            if ((c0noniso > 0) && (txtcz == 0)) {
+                debug(F100,"scanstring 7-bit BIN (c0coniso)","",0);
+                rc = FT_BIN;
+            } else {
+                debug(F100,"scanstring 7-bit ISO2022 TEXT (no c0noniso)","",0);
+                rc = FT_7BIT;
+            }
+        } else {                        /* 7-bit text */
+            debug(F100,"scanstring 7-bit TEXT (no c0controls)","",0);
+            rc = FT_7BIT;
+        }
+    } else if (!c0noniso || txtcz) {    /* 8-bit text */
+        debug(F100,"scanstring 8-bit TEXT (no c0noniso)","",0);
+        rc = FT_8BIT;
+        val = c1controls ? 1 : 0;
+    } else {                            /* 8-bit binary */
+        debug(F100,"scanstring 8-bit BIN (c0noniso)","",0);
+        rc = FT_BIN;
+    }
+
+#else /* !UNICODE */
+
+    if (c0noniso) {
+        debug(F100,"scanstring 8-bit BIN (c0noniso)","",0);
+        rc = FT_BIN;
+    } else if (eightbit) {
+        debug(F100,"scanstring 8-bit TEXT (no c0noniso)","",0);
+        rc = FT_8BIT;
+        val = c1controls ? 1 : 0;
+    } else {
+        debug(F100,"scanstring 7-bit TEXT (no c0noniso)","",0);
+        rc = FT_7BIT;
+    }
+
+#endif /* UNICODE */
+
+  xscanstring:
+    debug(F101,"scanstring result ","",rc);
+    return(rc);
+}
+
+
+