new utf decoder
This patch replaces the current UTF-8 decoder with a new one, which is ~50 lines shorter and should be easier to understand. Parsing 5- and 6-byte sequences, if necessary, requires only a trivial modification of the UTF_SIZ constant and the utfbyte, utfmask, utfmin, and utfmax arrays.
This commit is contained in:
		 Damian Okrasa
					Damian Okrasa
				
			
				
					committed by
					
						 Roberto E. Vargas Caballero
						Roberto E. Vargas Caballero
					
				
			
			
				
	
			
			
			 Roberto E. Vargas Caballero
						Roberto E. Vargas Caballero
					
				
			
						parent
						
							71328cbcdc
						
					
				
				
					commit
					45b808b88e
				
			
							
								
								
									
										224
									
								
								st.c
									
									
									
									
									
								
							
							
						
						
									
										224
									
								
								st.c
									
									
									
									
									
								
							| @@ -55,6 +55,7 @@ char *argv0; | |||||||
| #define XEMBED_FOCUS_OUT 5 | #define XEMBED_FOCUS_OUT 5 | ||||||
|  |  | ||||||
| /* Arbitrary sizes */ | /* Arbitrary sizes */ | ||||||
|  | #define UTF_INVALID   0xFFFD | ||||||
| #define UTF_SIZ       4 | #define UTF_SIZ       4 | ||||||
| #define ESC_BUF_SIZ   (128*UTF_SIZ) | #define ESC_BUF_SIZ   (128*UTF_SIZ) | ||||||
| #define ESC_ARG_SIZ   16 | #define ESC_ARG_SIZ   16 | ||||||
| @@ -442,10 +443,12 @@ static void selcopy(void); | |||||||
| static void selscroll(int, int); | static void selscroll(int, int); | ||||||
| static void selsnap(int, int *, int *, int); | static void selsnap(int, int *, int *, int); | ||||||
|  |  | ||||||
| static int utf8decode(char *, long *); | static size_t utf8decode(char *, long *, size_t); | ||||||
| static int utf8encode(long *, char *); | static long utf8decodebyte(char, size_t *); | ||||||
| static int utf8size(char *); | static size_t utf8encode(long, char *, size_t); | ||||||
| static int isfullutf8(char *, int); | static char utf8encodebyte(long, size_t); | ||||||
|  | static size_t utf8len(char *); | ||||||
|  | static size_t utf8validate(long *, size_t); | ||||||
|  |  | ||||||
| static ssize_t xwrite(int, char *, size_t); | static ssize_t xwrite(int, char *, size_t); | ||||||
| static void *xmalloc(size_t); | static void *xmalloc(size_t); | ||||||
| @@ -490,6 +493,11 @@ static int oldbutton = 3; /* button event on startup: 3 = release */ | |||||||
| static char *usedfont = NULL; | static char *usedfont = NULL; | ||||||
| static double usedfontsize = 0; | static double usedfontsize = 0; | ||||||
|  |  | ||||||
|  | static uchar utfbyte[UTF_SIZ + 1] = {0x80,    0, 0xC0, 0xE0, 0xF0}; | ||||||
|  | static uchar utfmask[UTF_SIZ + 1] = {0xC0, 0x80, 0xE0, 0xF0, 0xF8}; | ||||||
|  | static long utfmin[UTF_SIZ + 1] = {       0,    0,  0x80,  0x800,  0x10000}; | ||||||
|  | static long utfmax[UTF_SIZ + 1] = {0x10FFFF, 0x7F, 0x7FF, 0xFFFF, 0x10FFFF}; | ||||||
|  |  | ||||||
| /* Font Ring Cache */ | /* Font Ring Cache */ | ||||||
| enum { | enum { | ||||||
| 	FRC_NORMAL, | 	FRC_NORMAL, | ||||||
| @@ -549,128 +557,69 @@ xstrdup(char *s) { | |||||||
| 	return p; | 	return p; | ||||||
| } | } | ||||||
|  |  | ||||||
| int | size_t | ||||||
| utf8decode(char *s, long *u) { | utf8decode(char *c, long *u, size_t clen) { | ||||||
| 	uchar c; | 	size_t i, j, len, type; | ||||||
| 	int i, n, rtn; | 	long udecoded; | ||||||
|  |  | ||||||
| 	rtn = 1; | 	*u = UTF_INVALID; | ||||||
| 	c = *s; | 	if(!clen) | ||||||
| 	if(~c & 0x80) { /* 0xxxxxxx */ | 		return 0; | ||||||
| 		*u = c; | 	udecoded = utf8decodebyte(c[0], &len); | ||||||
| 		return rtn; | 	if(!BETWEEN(len, 1, UTF_SIZ)) | ||||||
| 	} else if((c & 0xE0) == 0xC0) { /* 110xxxxx */ |  | ||||||
| 		*u = c & 0x1F; |  | ||||||
| 		n = 1; |  | ||||||
| 	} else if((c & 0xF0) == 0xE0) { /* 1110xxxx */ |  | ||||||
| 		*u = c & 0x0F; |  | ||||||
| 		n = 2; |  | ||||||
| 	} else if((c & 0xF8) == 0xF0) { /* 11110xxx */ |  | ||||||
| 		*u = c & 0x07; |  | ||||||
| 		n = 3; |  | ||||||
| 	} else { |  | ||||||
| 		goto invalid; |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	for(i = n, ++s; i > 0; --i, ++rtn, ++s) { |  | ||||||
| 		c = *s; |  | ||||||
| 		if((c & 0xC0) != 0x80) /* 10xxxxxx */ |  | ||||||
| 			goto invalid; |  | ||||||
| 		*u <<= 6; |  | ||||||
| 		*u |= c & 0x3F; |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	if((n == 1 && *u < 0x80) || |  | ||||||
| 	   (n == 2 && *u < 0x800) || |  | ||||||
| 	   (n == 3 && *u < 0x10000) || |  | ||||||
| 	   (*u >= 0xD800 && *u <= 0xDFFF)) { |  | ||||||
| 		goto invalid; |  | ||||||
| 	} |  | ||||||
|  |  | ||||||
| 	return rtn; |  | ||||||
| invalid: |  | ||||||
| 	*u = 0xFFFD; |  | ||||||
|  |  | ||||||
| 	return rtn; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| int |  | ||||||
| utf8encode(long *u, char *s) { |  | ||||||
| 	uchar *sp; |  | ||||||
| 	ulong uc; |  | ||||||
| 	int i, n; |  | ||||||
|  |  | ||||||
| 	sp = (uchar *)s; |  | ||||||
| 	uc = *u; |  | ||||||
| 	if(uc < 0x80) { |  | ||||||
| 		*sp = uc; /* 0xxxxxxx */ |  | ||||||
| 		return 1; | 		return 1; | ||||||
| 	} else if(*u < 0x800) { | 	for(i = 1, j = 1; i < clen && j < len; ++i, ++j) { | ||||||
| 		*sp = (uc >> 6) | 0xC0; /* 110xxxxx */ | 		udecoded = (udecoded << 6) | utf8decodebyte(c[i], &type); | ||||||
| 		n = 1; | 		if(type != 0) | ||||||
| 	} else if(uc < 0x10000) { | 			return j; | ||||||
| 		*sp = (uc >> 12) | 0xE0; /* 1110xxxx */ |  | ||||||
| 		n = 2; |  | ||||||
| 	} else if(uc <= 0x10FFFF) { |  | ||||||
| 		*sp = (uc >> 18) | 0xF0; /* 11110xxx */ |  | ||||||
| 		n = 3; |  | ||||||
| 	} else { |  | ||||||
| 		goto invalid; |  | ||||||
| 	} | 	} | ||||||
|  | 	if(j < len) | ||||||
| 	for(i=n,++sp; i>0; --i,++sp) |  | ||||||
| 		*sp = ((uc >> 6*(i-1)) & 0x3F) | 0x80; /* 10xxxxxx */ |  | ||||||
|  |  | ||||||
| 	return n+1; |  | ||||||
| invalid: |  | ||||||
| 	/* U+FFFD */ |  | ||||||
| 	*s++ = '\xEF'; |  | ||||||
| 	*s++ = '\xBF'; |  | ||||||
| 	*s = '\xBD'; |  | ||||||
|  |  | ||||||
| 	return 3; |  | ||||||
| } |  | ||||||
|  |  | ||||||
| /* use this if your buffer is less than UTF_SIZ, it returns 1 if you can decode |  | ||||||
|    UTF-8 otherwise return 0 */ |  | ||||||
| int |  | ||||||
| isfullutf8(char *s, int b) { |  | ||||||
| 	uchar *c1, *c2, *c3; |  | ||||||
|  |  | ||||||
| 	c1 = (uchar *)s; |  | ||||||
| 	c2 = (uchar *)++s; |  | ||||||
| 	c3 = (uchar *)++s; |  | ||||||
| 	if(b < 1) { |  | ||||||
| 		return 0; | 		return 0; | ||||||
| 	} else if((*c1 & 0xE0) == 0xC0 && b == 1) { | 	*u = udecoded; | ||||||
| 		return 0; | 	utf8validate(u, len); | ||||||
| 	} else if((*c1 & 0xF0) == 0xE0 && | 	return len; | ||||||
| 	    ((b == 1) || |  | ||||||
| 	    ((b == 2) && (*c2 & 0xC0) == 0x80))) { |  | ||||||
| 		return 0; |  | ||||||
| 	} else if((*c1 & 0xF8) == 0xF0 && |  | ||||||
| 	    ((b == 1) || |  | ||||||
| 	    ((b == 2) && (*c2 & 0xC0) == 0x80) || |  | ||||||
| 	    ((b == 3) && (*c2 & 0xC0) == 0x80 && (*c3 & 0xC0) == 0x80))) { |  | ||||||
| 		return 0; |  | ||||||
| 	} else { |  | ||||||
| 		return 1; |  | ||||||
| 	} |  | ||||||
| } | } | ||||||
|  |  | ||||||
| int | long | ||||||
| utf8size(char *s) { | utf8decodebyte(char c, size_t *i) { | ||||||
| 	uchar c = *s; | 	for(*i = 0; *i < LEN(utfmask); ++(*i)) | ||||||
|  | 		if(((uchar)c & utfmask[*i]) == utfbyte[*i]) | ||||||
| 	if(~c & 0x80) { | 			return (uchar)c & ~utfmask[*i]; | ||||||
| 		return 1; | 	return 0; | ||||||
| 	} else if((c & 0xE0) == 0xC0) { |  | ||||||
| 		return 2; |  | ||||||
| 	} else if((c & 0xF0) == 0xE0) { |  | ||||||
| 		return 3; |  | ||||||
| 	} else { |  | ||||||
| 		return 4; |  | ||||||
| } | } | ||||||
|  |  | ||||||
|  | size_t | ||||||
|  | utf8encode(long u, char *c, size_t clen) { | ||||||
|  | 	size_t len, i; | ||||||
|  |  | ||||||
|  | 	len = utf8validate(&u, 0); | ||||||
|  | 	if(clen < len) | ||||||
|  | 		return 0; | ||||||
|  | 	for(i = len - 1; i != 0; --i) { | ||||||
|  | 		c[i] = utf8encodebyte(u, 0); | ||||||
|  | 		u >>= 6; | ||||||
|  | 	} | ||||||
|  | 	c[0] = utf8encodebyte(u, len); | ||||||
|  | 	return len; | ||||||
|  | } | ||||||
|  |  | ||||||
|  | char | ||||||
|  | utf8encodebyte(long u, size_t i) { | ||||||
|  | 	return utfbyte[i] | (u & ~utfmask[i]); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | size_t | ||||||
|  | utf8len(char *c) { | ||||||
|  | 	return utf8decode(c, &(long){0}, UTF_SIZ); | ||||||
|  | } | ||||||
|  |  | ||||||
|  | size_t | ||||||
|  | utf8validate(long *u, size_t i) { | ||||||
|  | 	if(!BETWEEN(*u, utfmin[i], utfmax[i]) || BETWEEN(*u, 0xD800, 0xDFFF)) | ||||||
|  | 		*u = UTF_INVALID; | ||||||
|  | 	for(i = 1; *u > utfmax[i]; ++i) | ||||||
|  | 		; | ||||||
|  | 	return i; | ||||||
| } | } | ||||||
|  |  | ||||||
| static void | static void | ||||||
| @@ -984,7 +933,7 @@ getsel(void) { | |||||||
| 				if(!selected(x, y) || (gp->mode & ATTR_WDUMMY)) | 				if(!selected(x, y) || (gp->mode & ATTR_WDUMMY)) | ||||||
| 					continue; | 					continue; | ||||||
|  |  | ||||||
| 				size = utf8size(gp->c); | 				size = utf8len(gp->c); | ||||||
| 				memcpy(ptr, gp->c, size); | 				memcpy(ptr, gp->c, size); | ||||||
| 				ptr += size; | 				ptr += size; | ||||||
| 			} | 			} | ||||||
| @@ -1298,7 +1247,7 @@ ttyread(void) { | |||||||
| 	char *ptr; | 	char *ptr; | ||||||
| 	char s[UTF_SIZ]; | 	char s[UTF_SIZ]; | ||||||
| 	int charsize; /* size of utf8 char in bytes */ | 	int charsize; /* size of utf8 char in bytes */ | ||||||
| 	long utf8c; | 	long unicodep; | ||||||
| 	int ret; | 	int ret; | ||||||
|  |  | ||||||
| 	/* append read bytes to unprocessed bytes */ | 	/* append read bytes to unprocessed bytes */ | ||||||
| @@ -1308,9 +1257,8 @@ ttyread(void) { | |||||||
| 	/* process every complete utf8 char */ | 	/* process every complete utf8 char */ | ||||||
| 	buflen += ret; | 	buflen += ret; | ||||||
| 	ptr = buf; | 	ptr = buf; | ||||||
| 	while(buflen >= UTF_SIZ || isfullutf8(ptr,buflen)) { | 	while(charsize = utf8decode(ptr, &unicodep, buflen)) { | ||||||
| 		charsize = utf8decode(ptr, &utf8c); | 		utf8encode(unicodep, s, UTF_SIZ); | ||||||
| 		utf8encode(&utf8c, s); |  | ||||||
| 		tputc(s, charsize); | 		tputc(s, charsize); | ||||||
| 		ptr += charsize; | 		ptr += charsize; | ||||||
| 		buflen -= charsize; | 		buflen -= charsize; | ||||||
| @@ -2414,14 +2362,14 @@ void | |||||||
| tputc(char *c, int len) { | tputc(char *c, int len) { | ||||||
| 	uchar ascii = *c; | 	uchar ascii = *c; | ||||||
| 	bool control = ascii < '\x20' || ascii == 0177; | 	bool control = ascii < '\x20' || ascii == 0177; | ||||||
| 	long u8char; | 	long unicodep; | ||||||
| 	int width; | 	int width; | ||||||
|  |  | ||||||
| 	if(len == 1) { | 	if(len == 1) { | ||||||
| 		width = 1; | 		width = 1; | ||||||
| 	} else { | 	} else { | ||||||
| 		utf8decode(c, &u8char); | 		utf8decode(c, &unicodep, UTF_SIZ); | ||||||
| 		width = wcwidth(u8char); | 		width = wcwidth(unicodep); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	if(IS_SET(MODE_PRINT)) | 	if(IS_SET(MODE_PRINT)) | ||||||
| @@ -3150,7 +3098,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||||||
| 	int frcflags; | 	int frcflags; | ||||||
| 	int u8fl, u8fblen, u8cblen, doesexist; | 	int u8fl, u8fblen, u8cblen, doesexist; | ||||||
| 	char *u8c, *u8fs; | 	char *u8c, *u8fs; | ||||||
| 	long u8char; | 	long unicodep; | ||||||
| 	Font *font = &dc.font; | 	Font *font = &dc.font; | ||||||
| 	FcResult fcres; | 	FcResult fcres; | ||||||
| 	FcPattern *fcpattern, *fontpattern; | 	FcPattern *fcpattern, *fontpattern; | ||||||
| @@ -3293,11 +3241,11 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||||||
| 		oneatatime = font->width != xw.cw; | 		oneatatime = font->width != xw.cw; | ||||||
| 		for(;;) { | 		for(;;) { | ||||||
| 			u8c = s; | 			u8c = s; | ||||||
| 			u8cblen = utf8decode(s, &u8char); | 			u8cblen = utf8decode(s, &unicodep, UTF_SIZ); | ||||||
| 			s += u8cblen; | 			s += u8cblen; | ||||||
| 			bytelen -= u8cblen; | 			bytelen -= u8cblen; | ||||||
|  |  | ||||||
| 			doesexist = XftCharExists(xw.dpy, font->match, u8char); | 			doesexist = XftCharExists(xw.dpy, font->match, unicodep); | ||||||
| 			if(oneatatime || !doesexist || bytelen <= 0) { | 			if(oneatatime || !doesexist || bytelen <= 0) { | ||||||
| 				if(oneatatime || bytelen <= 0) { | 				if(oneatatime || bytelen <= 0) { | ||||||
| 					if(doesexist) { | 					if(doesexist) { | ||||||
| @@ -3329,7 +3277,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||||||
|  |  | ||||||
| 		/* Search the font cache. */ | 		/* Search the font cache. */ | ||||||
| 		for(i = 0; i < frclen; i++) { | 		for(i = 0; i < frclen; i++) { | ||||||
| 			if(XftCharExists(xw.dpy, frc[i].font, u8char) | 			if(XftCharExists(xw.dpy, frc[i].font, unicodep) | ||||||
| 					&& frc[i].flags == frcflags) { | 					&& frc[i].flags == frcflags) { | ||||||
| 				break; | 				break; | ||||||
| 			} | 			} | ||||||
| @@ -3351,7 +3299,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||||||
| 			fcpattern = FcPatternDuplicate(font->pattern); | 			fcpattern = FcPatternDuplicate(font->pattern); | ||||||
| 			fccharset = FcCharSetCreate(); | 			fccharset = FcCharSetCreate(); | ||||||
|  |  | ||||||
| 			FcCharSetAddChar(fccharset, u8char); | 			FcCharSetAddChar(fccharset, unicodep); | ||||||
| 			FcPatternAddCharSet(fcpattern, FC_CHARSET, | 			FcPatternAddCharSet(fcpattern, FC_CHARSET, | ||||||
| 					fccharset); | 					fccharset); | ||||||
| 			FcPatternAddBool(fcpattern, FC_SCALABLE, | 			FcPatternAddBool(fcpattern, FC_SCALABLE, | ||||||
| @@ -3387,7 +3335,7 @@ xdraws(char *s, Glyph base, int x, int y, int charlen, int bytelen) { | |||||||
| 				xp, winy + frc[i].font->ascent, | 				xp, winy + frc[i].font->ascent, | ||||||
| 				(FcChar8 *)u8c, u8cblen); | 				(FcChar8 *)u8c, u8cblen); | ||||||
|  |  | ||||||
| 		xp += xw.cw * wcwidth(u8char); | 		xp += xw.cw * wcwidth(unicodep); | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
| 	/* | 	/* | ||||||
| @@ -3430,7 +3378,7 @@ xdrawcursor(void) { | |||||||
| 	memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ); | 	memcpy(g.c, term.line[term.c.y][term.c.x].c, UTF_SIZ); | ||||||
|  |  | ||||||
| 	/* remove the old cursor */ | 	/* remove the old cursor */ | ||||||
| 	sl = utf8size(term.line[oldy][oldx].c); | 	sl = utf8len(term.line[oldy][oldx].c); | ||||||
| 	width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1; | 	width = (term.line[oldy][oldx].mode & ATTR_WIDE)? 2 : 1; | ||||||
| 	xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx, | 	xdraws(term.line[oldy][oldx].c, term.line[oldy][oldx], oldx, | ||||||
| 			oldy, width, sl); | 			oldy, width, sl); | ||||||
| @@ -3444,7 +3392,7 @@ xdrawcursor(void) { | |||||||
| 				g.bg = defaultfg; | 				g.bg = defaultfg; | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 			sl = utf8size(g.c); | 			sl = utf8len(g.c); | ||||||
| 			width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\ | 			width = (term.line[term.c.y][curx].mode & ATTR_WIDE)\ | ||||||
| 				? 2 : 1; | 				? 2 : 1; | ||||||
| 			xdraws(g.c, g, term.c.x, term.c.y, width, sl); | 			xdraws(g.c, g, term.c.x, term.c.y, width, sl); | ||||||
| @@ -3516,7 +3464,7 @@ drawregion(int x1, int y1, int x2, int y2) { | |||||||
| 	Glyph base, new; | 	Glyph base, new; | ||||||
| 	char buf[DRAW_BUF_SIZ]; | 	char buf[DRAW_BUF_SIZ]; | ||||||
| 	bool ena_sel = sel.ob.x != -1; | 	bool ena_sel = sel.ob.x != -1; | ||||||
| 	long u8char; | 	long unicodep; | ||||||
|  |  | ||||||
| 	if(sel.alt ^ IS_SET(MODE_ALTSCREEN)) | 	if(sel.alt ^ IS_SET(MODE_ALTSCREEN)) | ||||||
| 		ena_sel = 0; | 		ena_sel = 0; | ||||||
| @@ -3548,7 +3496,7 @@ drawregion(int x1, int y1, int x2, int y2) { | |||||||
| 				base = new; | 				base = new; | ||||||
| 			} | 			} | ||||||
|  |  | ||||||
| 			sl = utf8decode(new.c, &u8char); | 			sl = utf8decode(new.c, &unicodep, UTF_SIZ); | ||||||
| 			memcpy(buf+ib, new.c, sl); | 			memcpy(buf+ib, new.c, sl); | ||||||
| 			ib += sl; | 			ib += sl; | ||||||
| 			ic += (new.mode & ATTR_WIDE)? 2 : 1; | 			ic += (new.mode & ATTR_WIDE)? 2 : 1; | ||||||
| @@ -3707,7 +3655,7 @@ kpress(XEvent *ev) { | |||||||
| 		if(IS_SET(MODE_8BIT)) { | 		if(IS_SET(MODE_8BIT)) { | ||||||
| 			if(*buf < 0177) { | 			if(*buf < 0177) { | ||||||
| 				c = *buf | 0x80; | 				c = *buf | 0x80; | ||||||
| 				len = utf8encode(&c, buf); | 				len = utf8encode(c, buf, UTF_SIZ); | ||||||
| 			} | 			} | ||||||
| 		} else { | 		} else { | ||||||
| 			buf[1] = buf[0]; | 			buf[1] = buf[0]; | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user