Commits

Serge Zaitsev  committed d68942b

skip binary (non-utf8) files

  • Participants
  • Parent commits efd8d96

Comments (0)

Files changed (2)

 
 void scan_file_cb(const char *path, void *arg) {
 	struct index_writer *writer = (struct index_writer *) arg;
-	index_writer_add(writer, path);
+	if (index_writer_add(writer, path) != 0) {
+		printf("skipped: %s\n", path);
+	} else {
+		printf("indexed: %s\n", path);
+	}
 }
 
 int scandirs(const char *root, scan_cb cb, void *arg) {
 			return -errno;
 		}
 		if (S_ISREG(st.st_mode)) {
-			printf("indexing %s\n", path);
 			cb(path, arg);
 		} else if (S_ISDIR(st.st_mode)) {
 			scandirs(path, cb, arg);
 	free(writer);
 }
 
/*
 * Heuristic check that the two most recent bytes packed into `trigram`
 * may legally sit next to each other in UTF-8 text.
 *
 * Only the low 16 bits are inspected: bits 8..15 hold the previous byte and
 * bits 0..7 hold the current byte. Byte classes used here:
 *   0x00..0x7f  ASCII
 *   0x80..0xbf  continuation byte
 *   0xc0..0xf7  lead byte of a multi-byte sequence
 *   0xf8..0xff  never accepted
 *
 * This is a pairwise plausibility test, not a full UTF-8 decoder: it does not
 * count how many continuation bytes a lead byte requires.
 *
 * Returns non-zero when the pair is plausible, 0 otherwise.
 */
static int is_valid_utf(uint32_t trigram) {
	uint8_t prev = (trigram >> 8) & 0xff;
	uint8_t cur = trigram & 0xff;

	if (prev < 0x80) {
		/* After ASCII: more ASCII, or the start of a multi-byte sequence. */
		return cur < 0x80 || (0xc0 <= cur && cur < 0xf8);
	}
	if (prev < 0xc0) {
		/* After a continuation byte: anything except 0xf8..0xff. */
		return cur < 0xf8;
	}
	if (prev < 0xf8) {
		/*
		 * After a lead byte only a continuation byte may follow. The upper
		 * bound is exclusive: 0xc0 is already in the lead-byte range.
		 */
		return 0x80 <= cur && cur < 0xc0;
	}
	/* Previous byte is 0xf8..0xff. */
	return 0;
}
+
 int index_writer_add(struct index_writer *writer, const char *path) {
 	FILE *f;
 	unsigned char buf[BUFSIZ];
 		for (i = 0; i < n; i++) {
 			trigram = (trigram << 8) & MAX_TRIGRAM;
 			trigram = trigram | buf[i];
+
+			if (is_valid_utf(trigram) == 0) {
+				return -EINVAL;
+			}
+
 			if (skip_bytes > 0) {
 				skip_bytes--;
 			} else {