Index: src/Makefile.am
===================================================================
--- src/Makefile.am (revision 13051)
+++ src/Makefile.am (working copy)
@@ -24,6 +24,7 @@
     m_conf.c m_glob.c m_sched.c \
     s_main.c s_inter.c s_file.c s_print.c \
     s_loader.c s_path.c s_entry.c s_audio.c s_midi.c \
+    s_utf8.c \
     d_ugen.c d_ctl.c d_arithmetic.c d_osc.c d_filter.c d_dac.c d_misc.c \
     d_math.c d_fft.c d_array.c d_global.c \
     d_delay.c d_resample.c \
Index: src/g_editor.c
===================================================================
--- src/g_editor.c (revision 13051)
+++ src/g_editor.c (working copy)
@@ -9,6 +9,7 @@
 #include "s_stuff.h"
 #include "g_canvas.h"
 #include <string.h>
+#include "s_utf8.h" /*-- moo --*/
 
 void glist_readfrombinbuf(t_glist *x, t_binbuf *b, char *filename,
     int selectem);
@@ -1666,8 +1667,9 @@
             gotkeysym = av[1].a_w.w_symbol;
         else if (av[1].a_type == A_FLOAT)
         {
-            char buf[3];
-            sprintf(buf, "%c", (int)(av[1].a_w.w_float));
+            /*-- moo: assume keynum is a Unicode codepoint; encode as UTF-8 --*/
+            char buf[UTF8_MAXBYTES1];
+            u8_wc_toutf8_nul(buf, (UCS4)(av[1].a_w.w_float));
             gotkeysym = gensym(buf);
         }
         else gotkeysym = gensym("?");
Index: src/s_utf8.c
===================================================================
--- src/s_utf8.c (revision 0)
+++ src/s_utf8.c (revision 0)
@@ -0,0 +1,298 @@
+/*
+  Basic UTF-8 manipulation routines
+  by Jeff Bezanson
+  placed in the public domain Fall 2005
+
+  This code is designed to provide the utilities you need to manipulate
+  UTF-8 as an internal string encoding. These functions do not perform the
+  error checking normally needed when handling UTF-8 data, so if you happen
+  to be from the Unicode Consortium you will want to flay me alive.
+  I do this because error checking can be performed at the boundaries (I/O),
+  with these routines reserved for higher performance on data known to be
+  valid.
+
+  modified by Bryan Jurish (moo) March 2009
+  + removed some unneeded functions (escapes, printf etc), added others
+*/
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+#ifdef WIN32
+#include <malloc.h>
+#else
+#include <alloca.h>
+#endif
+
+#include "s_utf8.h"
+
+static const u_int32_t offsetsFromUTF8[6] = {
+    0x00000000UL, 0x00003080UL, 0x000E2080UL,
+    0x03C82080UL, 0xFA082080UL, 0x82082080UL
+};
+
+static const char trailingBytesForUTF8[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5
+};
+
+
+/* returns length of next utf-8 sequence */
+int u8_seqlen(char *s)
+{
+    return trailingBytesForUTF8[(unsigned int)(unsigned char)s[0]] + 1;
+}
+
+/* conversions without error checking
+   only works for valid UTF-8, i.e. no 5- or 6-byte sequences
+   srcsz = source size in bytes, or negative if 0-terminated
+   sz = dest size in # of wide characters
+
+   returns # characters converted
+   dest will always be L'\0'-terminated, even if there isn't enough room
+   for all the characters (unless sz < 1, when nothing is written at all).
+   if sz = srcsz+1 (i.e. 4*srcsz+4 bytes), there will always be enough space.
+*/
+int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz)
+{
+    u_int32_t ch;
+    int bounded = (srcsz >= 0); /* otherwise src is 0-terminated */
+    /* never form src + (-1): a pointer before the buffer is undefined */
+    char *src_end = src + (bounded ? srcsz : 0);
+    int nb;
+    int i=0;
+
+    if (sz < 1)
+        return 0; /* no room even for the terminator */
+    while (i < sz-1) {
+        /* look for the end of input before reading the lead byte */
+        if (bounded ? src >= src_end : *src == 0)
+            goto done_toucs;
+        nb = trailingBytesForUTF8[(unsigned char)*src];
+        /* a sequence cut short by the end of the buffer stops us too */
+        if (bounded && src + nb >= src_end)
+            goto done_toucs;
+        ch = 0;
+        switch (nb) {
+            /* these fall through deliberately */
+#if UTF8_SUPPORT_FULL_UCS4
+        case 5: ch += (unsigned char)*src++; ch <<= 6;
+        case 4: ch += (unsigned char)*src++; ch <<= 6;
+#endif
+        case 3: ch += (unsigned char)*src++; ch <<= 6;
+        case 2: ch += (unsigned char)*src++; ch <<= 6;
+        case 1: ch += (unsigned char)*src++; ch <<= 6;
+        case 0: ch += (unsigned char)*src++;
+        }
+        ch -= offsetsFromUTF8[nb];
+        dest[i++] = ch;
+    }
+ done_toucs:
+    dest[i] = 0;
+    return i;
+}
+
+/* srcsz = number of source characters, or -1 if 0-terminated
+   sz = size of dest buffer in bytes
+
+   returns # characters converted
+   dest will only be '\0'-terminated if there is enough space. this is
+   for consistency; imagine there are 2 bytes of space left, but the next
+   character requires 3 bytes. in this case we could NUL-terminate, but in
+   general we can't when there's insufficient space. therefore this function
+   only NUL-terminates if all the characters fit, and there's space for
+   the NUL as well.
+   the destination string will never be bigger than the source string.
+*/
+int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz)
+{
+    u_int32_t ch;
+    int i = 0;
+    char *dest_end = dest + sz;
+
+    while (srcsz<0 ? src[i]!=0 : i < srcsz) {
+        ch = src[i];
+        if (ch < 0x80) {
+            if (dest >= dest_end)
+                return i;
+            *dest++ = (char)ch;
+        }
+        else if (ch < 0x800) {
+            if (dest >= dest_end-1)
+                return i;
+            *dest++ = (ch>>6) | 0xC0;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x10000) {
+            if (dest >= dest_end-2)
+                return i;
+            *dest++ = (ch>>12) | 0xE0;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        else if (ch < 0x110000) {
+            if (dest >= dest_end-3)
+                return i;
+            *dest++ = (ch>>18) | 0xF0;
+            *dest++ = ((ch>>12) & 0x3F) | 0x80;
+            *dest++ = ((ch>>6) & 0x3F) | 0x80;
+            *dest++ = (ch & 0x3F) | 0x80;
+        }
+        i++;
+    }
+    if (dest < dest_end)
+        *dest = '\0';
+    return i;
+}
+
+/* moo: get byte length of character number, or 0 if not supported */
+int u8_wc_nbytes(u_int32_t ch)
+{
+  if (ch < 0x80) return 1;
+  if (ch < 0x800) return 2;
+  if (ch < 0x10000) return 3;
+#if UTF8_SUPPORT_FULL_UCS4
+  /*-- moo: support full UCS-4 range? --*/
+  if (ch < 0x200000) return 4;
+  if (ch < 0x4000000) return 5;
+  if (ch < 0x7fffffffUL) return 6;
+#else
+  /* stop at U+10FFFF, the last codepoint u8_wc_toutf8() will encode */
+  if (ch < 0x110000) return 4;
+#endif
+  return 0; /*-- bad input --*/
+}
+
+/* moo: compute required storage for UTF-8 encoding of 's[0..n-1]' */
+int u8_wcs_nbytes(u_int32_t *ucs, int size)
+{
+  int i, nbytes = 0;
+
+  for (i = 0; i < size; i++)
+    nbytes += u8_wc_nbytes(ucs[i]);
+  return nbytes;
+}
+
+int u8_wc_toutf8(char *dest, u_int32_t ch)
+{
+    if (ch < 0x80) {
+        dest[0] = (char)ch;
+        return 1;
+    }
+    if (ch < 0x800) {
+        dest[0] = (ch>>6) | 0xC0;
+        dest[1] = (ch & 0x3F) | 0x80;
+        return 2;
+    }
+    if (ch < 0x10000) {
+        dest[0] = (ch>>12) | 0xE0;
+        dest[1] = ((ch>>6) & 0x3F) | 0x80;
+        dest[2] = (ch & 0x3F) | 0x80;
+        return 3;
+    }
+    if (ch < 0x110000) {
+        dest[0] = (ch>>18) | 0xF0;
+        dest[1] = ((ch>>12) & 0x3F) | 0x80;
+        dest[2] = ((ch>>6) & 0x3F) | 0x80;
+        dest[3] = (ch & 0x3F) | 0x80;
+        return 4;
+    }
+    return 0;
+}
+
+/*-- moo --*/
+int u8_wc_toutf8_nul(char *dest, u_int32_t ch)
+{
+  int sz = u8_wc_toutf8(dest,ch);
+  dest[sz] = '\0';
+  return sz;
+}
+
+/* charnum => byte offset */
+int u8_offset(char *str, int charnum)
+{
+    int offs=0;
+
+    while (charnum > 0 && str[offs]) {
+        (void)(isutf(str[++offs]) || isutf(str[++offs]) ||
+               isutf(str[++offs]) || ++offs);
+        charnum--;
+    }
+    return offs;
+}
+
+/* byte offset => charnum */
+int u8_charnum(char *s, int offset)
+{
+    int charnum = 0, offs=0;
+
+    while (offs < offset && s[offs]) {
+        (void)(isutf(s[++offs]) || isutf(s[++offs]) ||
+               isutf(s[++offs]) || ++offs);
+        charnum++;
+    }
+    return charnum;
+}
+
+/* reads the next utf-8 sequence out of a string, updating an index */
+u_int32_t u8_nextchar(char *s, int *i)
+{
+    u_int32_t ch = 0;
+    int sz = 0;
+
+    do {
+        ch <<= 6;
+        ch += (unsigned char)s[(*i)++];
+        sz++;
+        /* ch==0 after one byte means we just consumed the terminator:
+           don't look at the byte beyond it.  sz<6 keeps the
+           offsetsFromUTF8[] index in range on malformed input. */
+    } while (!(sz == 1 && ch == 0) && sz < 6 && s[*i] && !isutf(s[*i]));
+    ch -= offsetsFromUTF8[sz-1];
+
+    return ch;
+}
+
+/* number of characters */
+int u8_strlen(char *s)
+{
+    int count = 0;
+    int i = 0;
+
+    while (u8_nextchar(s, &i) != 0)
+        count++;
+
+    return count;
+}
+
+void u8_inc(char *s, int *i)
+{
+    (void)(isutf(s[++(*i)]) || isutf(s[++(*i)]) ||
+           isutf(s[++(*i)]) || ++(*i));
+}
+
+void u8_dec(char *s, int *i)
+{
+    (void)(isutf(s[--(*i)]) || isutf(s[--(*i)]) ||
+           isutf(s[--(*i)]) || --(*i));
+}
+
+/*-- moo --*/
+void u8_inc_ptr(char **sp)
+{
+    (void)(isutf(*(++(*sp))) || isutf(*(++(*sp))) ||
+           isutf(*(++(*sp))) || ++(*sp));
+}
+
+/*-- moo --*/
+void u8_dec_ptr(char **sp)
+{
+    (void)(isutf(*(--(*sp))) || isutf(*(--(*sp))) ||
+           isutf(*(--(*sp))) || --(*sp));
+}
Index: src/g_rtext.c
===================================================================
--- src/g_rtext.c (revision 13051)
+++ src/g_rtext.c (working copy)
@@ -13,6 +13,7 @@
 #include "m_pd.h"
 #include "s_stuff.h"
 #include "g_canvas.h"
+#include "s_utf8.h"
 
 
 #define LMARGIN 2
@@ -32,10 +33,10 @@
 
 struct _rtext
 {
-    char *x_buf;
-    int x_bufsize;
-    int x_selstart;
-    int x_selend;
+    char *x_buf;    /*-- raw byte string, assumed UTF-8 encoded (moo) --*/
+    int x_bufsize;  /*-- byte length --*/
+    int x_selstart; /*-- byte offset --*/
+    int x_selend;   /*-- byte offset --*/
     int x_active;
     int x_dragfrom;
     int x_height;
@@ -119,6 +120,15 @@
 
 /* LATER deal with tcl-significant characters */
 
+/* firstone(), lastone()
+ *  + returns byte offset of (first|last) occurrence of 'c' in
's[0..n-1]', or
+ *    -1 if none was found
+ *  + 's' is a raw byte string
+ *  + 'c' is a byte value
+ *  + 'n' is the length (in bytes) of the prefix of 's' to be searched.
+ *  + we could make these functions work on logical characters in utf8 strings,
+ *    but we don't really need to...
+ */
 static int firstone(char *s, int c, int n)
 {
     char *s2 = s + n;
@@ -155,6 +165,16 @@
     of the entire text in pixels.
     */
 
+    /*-- moo:
+     *  + some variables from the original version have been renamed
+     *  + variables with a "_b" suffix are raw byte strings, lengths, or offsets
+     *  + variables with a "_c" suffix are logical character lengths or offsets
+     *    (assuming valid UTF-8 encoded byte string in x->x_buf)
+     *  + a fair amount of O(n) computations required to convert between raw byte
+     *    offsets (needed by the C side) and logical character offsets (needed by
+     *    the GUI); selstart_b/selend_b are offsets into tempbuf, not x->x_buf
+     */
+
     /* LATER get this and sys_vgui to work together properly,
         breaking up messages as needed.  As of now, there's
         a limit of 1950 characters, imposed by sys_vgui(). */
@@ -171,14 +191,16 @@
 {
     t_float dispx, dispy;
     char smallbuf[200], *tempbuf;
-    int outchars = 0, nlines = 0, ncolumns = 0,
+    int outchars_b = 0, nlines = 0, ncolumns = 0,
         pixwide, pixhigh, font, fontwidth, fontheight, findx, findy;
     int reportedindex = 0;
     t_canvas *canvas = glist_getcanvas(x->x_glist);
-    int widthspec = x->x_text->te_width;
-    int widthlimit = (widthspec ? widthspec : BOXWIDTH);
-    int inindex = 0;
-    int selstart = 0, selend = 0;
+    int widthspec_c = x->x_text->te_width;
+    int widthlimit_c = (widthspec_c ? widthspec_c : BOXWIDTH);
+    int inindex_b = 0;
+    int inindex_c = 0;
+    int selstart_b = 0, selend_b = 0;
+    int x_bufsize_c = u8_charnum(x->x_buf, x->x_bufsize);
         /* if we're a GOP (the new, "goprect" style) borrow the font size
         from the inside to preserve the spacing */
     if (pd_class(&x->x_text->te_pd) == canvas_class &&
@@ -193,65 +215,76 @@
     if (x->x_bufsize >= 100)
          tempbuf = (char *)t_getbytes(2 * x->x_bufsize + 1);
     else tempbuf = smallbuf;
-    while (x->x_bufsize - inindex > 0)
+    while (x_bufsize_c - inindex_c > 0)
     {
-        int inchars = x->x_bufsize - inindex;
-        int maxindex = (inchars > widthlimit ? widthlimit : inchars);
+        int inchars_b = x->x_bufsize - inindex_b;
+        int inchars_c = x_bufsize_c - inindex_c;
+        int maxindex_c = (inchars_c > widthlimit_c ? widthlimit_c : inchars_c);
+        int maxindex_b = u8_offset(x->x_buf + inindex_b, maxindex_c);
         int eatchar = 1;
-        int foundit = firstone(x->x_buf + inindex, '\n', maxindex);
-        if (foundit < 0)
+        int foundit_b = firstone(x->x_buf + inindex_b, '\n', maxindex_b);
+        int foundit_c;
+        if (foundit_b < 0)
         {
-            if (inchars > widthlimit)
+            if (inchars_c > widthlimit_c)
             {
-                foundit = lastone(x->x_buf + inindex, ' ', maxindex);
-                if (foundit < 0)
+                foundit_b = lastone(x->x_buf + inindex_b, ' ', maxindex_b);
+                if (foundit_b < 0)
                 {
-                    foundit = maxindex;
+                    foundit_b = maxindex_b;
+                    foundit_c = maxindex_c;
                     eatchar = 0;
                 }
+                else
+                    foundit_c = u8_charnum(x->x_buf + inindex_b, foundit_b);
             }
             else
             {
-                foundit = inchars;
+                foundit_b = inchars_b;
+                foundit_c = inchars_c;
                 eatchar = 0;
             }
         }
+        else
+            foundit_c = u8_charnum(x->x_buf + inindex_b, foundit_b);
+
         if (nlines == findy)
         {
             int actualx = (findx < 0 ? 0 :
-                (findx > foundit ? foundit : findx));
-            *indexp = inindex + actualx;
+                (findx > foundit_c ? foundit_c : findx));
+            *indexp = inindex_b + u8_offset(x->x_buf + inindex_b, actualx);
             reportedindex = 1;
         }
-        strncpy(tempbuf+outchars, x->x_buf + inindex, foundit);
-        if (x->x_selstart >= inindex &&
-            x->x_selstart <= inindex + foundit + eatchar)
-                selstart = x->x_selstart + outchars - inindex;
-        if (x->x_selend >= inindex &&
-            x->x_selend <= inindex + foundit + eatchar)
-                selend = x->x_selend + outchars - inindex;
-        outchars += foundit;
-        inindex += (foundit + eatchar);
-        if (inindex < x->x_bufsize)
-            tempbuf[outchars++] = '\n';
-        if (foundit > ncolumns)
-            ncolumns = foundit;
+        strncpy(tempbuf+outchars_b, x->x_buf + inindex_b, foundit_b);
+        if (x->x_selstart >= inindex_b &&
+            x->x_selstart <= inindex_b + foundit_b + eatchar)
+                selstart_b = x->x_selstart + outchars_b - inindex_b;
+        if (x->x_selend >= inindex_b &&
+            x->x_selend <= inindex_b + foundit_b + eatchar)
+                selend_b = x->x_selend + outchars_b - inindex_b;
+        outchars_b += foundit_b;
+        inindex_b += (foundit_b + eatchar);
+        inindex_c += (foundit_c + eatchar);
+        if (inindex_b < x->x_bufsize)
+            tempbuf[outchars_b++] = '\n';
+        if (foundit_c > ncolumns)
+            ncolumns = foundit_c;
         nlines++;
     }
     if (!reportedindex)
-        *indexp = outchars;
+        *indexp = outchars_b;
     dispx = text_xpix(x->x_text, x->x_glist);
     dispy = text_ypix(x->x_text, x->x_glist);
     if (nlines < 1) nlines = 1;
-    if (!widthspec)
+    if (!widthspec_c)
     {
         while (ncolumns < 3)
         {
-            tempbuf[outchars++] = ' ';
+            tempbuf[outchars_b++] = ' ';
             ncolumns++;
         }
     }
-    else ncolumns = widthspec;
+    else ncolumns = widthspec_c;
     pixwide = ncolumns * fontwidth + (LMARGIN + RMARGIN);
     pixhigh = nlines * fontheight + (TMARGIN + BMARGIN);
 
@@ -259,31 +292,32 @@
         sys_vgui("pdtk_text_new .x%lx.c {%s %s text} %f %f {%.*s} %d %s\n",
             canvas, x->x_tag, rtext_gettype(x)->s_name,
             dispx + LMARGIN, dispy + TMARGIN,
-            outchars, tempbuf, sys_hostfontsize(font),
+            outchars_b, tempbuf, sys_hostfontsize(font),
             (glist_isselected(x->x_glist,
                 &x->x_glist->gl_gobj)? "blue" : "black"));
     else if (action == SEND_UPDATE)
     {
         sys_vgui("pdtk_text_set .x%lx.c %s {%.*s}\n",
-            canvas, x->x_tag, outchars, tempbuf);
+            canvas, x->x_tag, outchars_b, tempbuf);
         if (pixwide != x->x_drawnwidth || pixhigh != x->x_drawnheight)
             text_drawborder(x->x_text, x->x_glist, x->x_tag,
                 pixwide, pixhigh, 0);
         if (x->x_active)
         {
-            if (selend > selstart)
+            if (selend_b > selstart_b)
             {
                 sys_vgui(".x%lx.c select from %s %d\n", canvas,
-                    x->x_tag, selstart);
+                    x->x_tag, u8_charnum(tempbuf, selstart_b)); /* count in the text sent to Tk */
                 sys_vgui(".x%lx.c select to %s %d\n", canvas,
-                    x->x_tag, selend + (sys_oldtclversion ? 0 : -1));
+                    x->x_tag, u8_charnum(tempbuf, selend_b)
+                    + (sys_oldtclversion ? 0 : -1));
                 sys_vgui(".x%lx.c focus \"\"\n", canvas);
             }
             else
             {
                 sys_vgui(".x%lx.c select clear\n", canvas);
                 sys_vgui(".x%lx.c icursor %s %d\n", canvas, x->x_tag,
-                    selstart);
+                    u8_charnum(tempbuf, selstart_b));
                 sys_vgui(".x%lx.c focus %s\n", canvas, x->x_tag);
             }
         }
@@ -448,12 +482,12 @@
                 ....
                 } */
             if (x->x_selstart && (x->x_selstart == x->x_selend))
-                x->x_selstart--;
+                u8_dec(x->x_buf, &x->x_selstart);
         }
         else if (n == 127)      /* delete */
         {
             if (x->x_selend < x->x_bufsize && (x->x_selstart == x->x_selend))
-                x->x_selend++;
+                u8_inc(x->x_buf, &x->x_selend);
         }
 
         ndel = x->x_selend - x->x_selstart;
@@ -466,7 +500,13 @@
 /* at Guenter's suggestion, use 'n>31' to test wither a character might
 be printable in whatever 8-bit character set we find ourselves. */
 
-        if (n == '\n' || (n > 31 && n != 127))
+/*-- moo:
+  ... but test with "<" rather than "!=" in order to accommodate unicode
+  codepoints for n (which we get since Tk is sending the "%A" substitution
+  for bind <Key>), effectively reducing the coverage of this clause to 7
+  bits.  Case n>127 is covered by the next clause.
+*/
+        if (n == '\n' || (n > 31 && n < 127))
         {
             newsize = x->x_bufsize+1;
             x->x_buf = resizebytes(x->x_buf, x->x_bufsize, newsize);
@@ -476,20 +516,39 @@
             x->x_bufsize = newsize;
             x->x_selstart = x->x_selstart + 1;
         }
+        /*--moo: check for unicode codepoints beyond 7-bit ASCII --*/
+        else if (n > 127)
+        {
+            int ch_nbytes = u8_wc_nbytes(n);
+            newsize = x->x_bufsize + ch_nbytes;
+            x->x_buf = resizebytes(x->x_buf, x->x_bufsize, newsize);
+            for (i = newsize-1; i >= x->x_selstart + ch_nbytes; i--)
+                x->x_buf[i] = x->x_buf[i-ch_nbytes]; /* open a ch_nbytes-wide gap at the cursor */
+            x->x_bufsize = newsize;
+            /*-- moo: assume canvas_key() has encoded keysym as UTF-8 */
+            strncpy(x->x_buf+x->x_selstart, keysym->s_name, ch_nbytes);
+            x->x_selstart = x->x_selstart + ch_nbytes;
+        }
         x->x_selend = x->x_selstart;
         x->x_glist->gl_editor->e_textdirty = 1;
     }
     else if (!strcmp(keysym->s_name, "Right"))
     {
         if (x->x_selend == x->x_selstart && x->x_selstart < x->x_bufsize)
-            x->x_selend = x->x_selstart = x->x_selstart + 1;
+        {
+            u8_inc(x->x_buf, &x->x_selstart);
+            x->x_selend = x->x_selstart;
+        }
         else
             x->x_selstart = x->x_selend;
     }
     else if (!strcmp(keysym->s_name, "Left"))
     {
         if (x->x_selend == x->x_selstart && x->x_selstart > 0)
-            x->x_selend = x->x_selstart = x->x_selstart - 1;
+        {
+            u8_dec(x->x_buf, &x->x_selstart);
+            x->x_selend = x->x_selstart;
+        }
         else
             x->x_selend = x->x_selstart;
     }
@@ -497,18 +556,18 @@
     else if (!strcmp(keysym->s_name, "Up"))
     {
         if (x->x_selstart)
-            x->x_selstart--;
+            u8_dec(x->x_buf, &x->x_selstart);
         while (x->x_selstart > 0 && x->x_buf[x->x_selstart] != '\n')
-            x->x_selstart--;
+            u8_dec(x->x_buf, &x->x_selstart);
         x->x_selend = x->x_selstart;
     }
     else if (!strcmp(keysym->s_name, "Down"))
     {
         while (x->x_selend < x->x_bufsize &&
             x->x_buf[x->x_selend] != '\n')
-            x->x_selend++;
+            u8_inc(x->x_buf, &x->x_selend);
         if (x->x_selend < x->x_bufsize)
-            x->x_selend++;
+            u8_inc(x->x_buf, &x->x_selend);
         x->x_selstart = x->x_selend;
     }
     rtext_senditup(x, SEND_UPDATE, &w, &h, &indx);
Index: src/s_utf8.h
===================================================================
--- src/s_utf8.h (revision 0)
+++ src/s_utf8.h (revision 0)
@@ -0,0 +1,91 @@
+#ifndef S_UTF8_H
+#define S_UTF8_H
+
+/*--moo--*/
+#ifndef u_int32_t
+# define u_int32_t unsigned int
+#endif
+
+#ifndef UCS4
+# define UCS4 u_int32_t
+#endif
+
+/* UTF8_SUPPORT_FULL_UCS4
+ *  define this to support the full potential range of UCS-4 codepoints
+ *  (in anticipation of a future UTF-8 standard)
+ */
+/*#define UTF8_SUPPORT_FULL_UCS4 1*/
+#undef UTF8_SUPPORT_FULL_UCS4
+
+/* UTF8_MAXBYTES
+ *   maximum number of bytes required to represent a single character in UTF-8
+ *
+ * UTF8_MAXBYTES1 = UTF8_MAXBYTES+1
+ *  maximum bytes per character including NUL terminator
+ */
+#ifdef UTF8_SUPPORT_FULL_UCS4
+# ifndef UTF8_MAXBYTES
+#  define UTF8_MAXBYTES  6
+# endif
+# ifndef UTF8_MAXBYTES1
+#  define UTF8_MAXBYTES1 7
+# endif
+#else
+# ifndef UTF8_MAXBYTES
+#  define UTF8_MAXBYTES  4
+# endif
+# ifndef UTF8_MAXBYTES1
+#  define UTF8_MAXBYTES1 5
+# endif
+#endif
+/*--/moo--*/
+
+/* is c the start of a utf8 sequence? */
+#define isutf(c) (((c)&0xC0)!=0x80)
+
+/* convert UTF-8 data to wide character */
+int u8_toucs(u_int32_t *dest, int sz, char *src, int srcsz);
+
+/* the opposite conversion */
+int u8_toutf8(char *dest, int sz, u_int32_t *src, int srcsz);
+
+/* moo: get byte length of character number, or 0 if not supported */
+int u8_wc_nbytes(u_int32_t ch);
+
+/* moo: compute required storage for UTF-8 encoding of 's[0..n-1]' */
+int u8_wcs_nbytes(u_int32_t *ucs, int size);
+
+/* single character to UTF-8, no NUL termination */
+int u8_wc_toutf8(char *dest, u_int32_t ch);
+
+/* moo: single character to UTF-8, with NUL termination */
+int u8_wc_toutf8_nul(char *dest, u_int32_t ch);
+
+/* character number to byte offset */
+int u8_offset(char *str, int charnum);
+
+/* byte offset to character number */
+int u8_charnum(char *s, int offset);
+
+/* return next character, updating an index variable */
+u_int32_t u8_nextchar(char *s, int *i);
+
+/* number of characters in a 0-terminated UTF-8 string */
+int u8_strlen(char *s);
+
+/* move to next character */
+void u8_inc(char *s, int *i);
+
+/* move to previous character */
+void u8_dec(char *s, int *i);
+
+/* moo: move pointer to next character */
+void u8_inc_ptr(char **sp);
+
+/* moo: move pointer to previous character */
+void u8_dec_ptr(char **sp);
+
+/* returns length of next utf-8 sequence */
+int u8_seqlen(char *s);
+
+#endif /* S_UTF8_H */