From 048ae3ae9a4a6b71706c6f36c8716c6ced79e1df Mon Sep 17 00:00:00 2001 From: Lars Mueller Date: Sat, 31 May 2025 23:00:57 +0200 Subject: [PATCH] Integrate PUC Lua's `utf8` library into Luanti --- CMakeLists.txt | 1 + doc/lua_api.md | 17 ++++++++ games/devtest/mods/unittests/misc.lua | 7 +++ lib/lutf8/CMakeLists.txt | 5 +++ lib/lutf8/lutf8.c | 61 +++++++++++++++++---------- lib/lutf8/lutf8.h | 6 +++ src/CMakeLists.txt | 3 ++ src/script/cpp_api/s_base.cpp | 5 +++ src/script/cpp_api/s_security.cpp | 4 ++ 9 files changed, 87 insertions(+), 22 deletions(-) create mode 100644 lib/lutf8/CMakeLists.txt create mode 100644 lib/lutf8/lutf8.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 70a027f57..ae4258a25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -290,6 +290,7 @@ if(NOT USE_LUAJIT) add_subdirectory(lib/bitop) endif() add_subdirectory(lib/sha256) +add_subdirectory(lib/lutf8) if(BUILD_UNITTESTS OR BUILD_BENCHMARKS) add_subdirectory(lib/catch2) diff --git a/doc/lua_api.md b/doc/lua_api.md index 3a6da1bdb..5e60ec228 100644 --- a/doc/lua_api.md +++ b/doc/lua_api.md @@ -11972,6 +11972,23 @@ Functions: bit.tobit, bit.tohex, bit.bnot, bit.band, bit.bor, bit.bxor, bit.lshi See http://bitop.luajit.org/ for advanced information. +UTF-8 Library +------------- + +Constants: + +* `utf8.charpattern` + +Functions: + +* `utf8.char(...)` +* `utf8.codes(s [, lax])` +* `utf8.codepoint(s [, i [, j [, lax]]])` +* `utf8.len(s [, i [, j [, lax]]])` +* `utf8.offset(s, n [, i])` + +See [the Lua 5.4 reference manual](https://www.lua.org/manual/5.4/manual.html#6.5) for more information. + Tracy Profiler -------------- diff --git a/games/devtest/mods/unittests/misc.lua b/games/devtest/mods/unittests/misc.lua index 28cc2c1eb..8f24b6347 100644 --- a/games/devtest/mods/unittests/misc.lua +++ b/games/devtest/mods/unittests/misc.lua @@ -341,3 +341,10 @@ local function test_ipc_poll(cb) print("delta: " .. (core.get_us_time() - t0) .. 
"us") end unittests.register("test_ipc_poll", test_ipc_poll) + +local function test_utf8() + assert(#string.char(0xc3, 0xa4):match(utf8.charpattern) == 2) + assert(("\0"):match(utf8.charpattern) == "\0") + assert(utf8.char(0x11, 0x22, 0x10abcd) == string.char(0x11, 0x22, 0xf4, 0x8a, 0xaf, 0x8d)) +end +unittests.register("test_utf8", test_utf8) diff --git a/lib/lutf8/CMakeLists.txt b/lib/lutf8/CMakeLists.txt new file mode 100644 index 000000000..981125550 --- /dev/null +++ b/lib/lutf8/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(lutf8 STATIC lutf8.c) +target_include_directories(lutf8 PRIVATE ${LUA_INCLUDE_DIR}) + +set(LUA_UTF8_INCLUDE_DIR ${CMAKE_CURRENT_SOURCE_DIR} PARENT_SCOPE) +set(LUA_UTF8_LIBRARY lutf8 PARENT_SCOPE) diff --git a/lib/lutf8/lutf8.c b/lib/lutf8/lutf8.c index 4c9784e09..65d268422 100644 --- a/lib/lutf8/lutf8.c +++ b/lib/lutf8/lutf8.c @@ -1,25 +1,21 @@ -/* -** $Id: lutf8lib.c $ -** Standard library for UTF-8 manipulation -** See Copyright Notice in lua.h -*/ +// PUC Lua UTF-8 library, with minor modifications for integration in Luanti. +// Taken from https://github.com/lua/lua/blob/c15543b9afa31ab5dc564511ae11acd808405e8f/lutf8lib.c +// MIT-licensed, see LICENSE.txt #define lutf8lib_c #define LUA_LIB -#include "lprefix.h" - +#include "lutf8.h" #include #include #include #include +#include #include "lua.h" #include "lauxlib.h" -#include "lualib.h" -#include "llimits.h" #define MAXUNICODE 0x10FFFFu @@ -34,6 +30,9 @@ #define iscontp(p) iscont(*(p)) +typedef uint32_t l_uint32; +typedef uint64_t lua_Unsigned; + /* from strlib */ /* translate a relative string position: negative means back from end */ static lua_Integer u_posrelat (lua_Integer pos, size_t len) { @@ -99,11 +98,11 @@ static int utflen (lua_State *L) { while (posi <= posj) { const char *s1 = utf8_decode(s + posi, NULL, !lax); if (s1 == NULL) { /* conversion error? */ - luaL_pushfail(L); /* return fail ... */ + lua_pushnil(L); /* return fail ... */ lua_pushinteger(L, posi + 1); /* ... 
and current position */ return 2; } - posi = ct_diff2S(s1 - s); + posi = (size_t)(s1 - s); n++; } lua_pushinteger(L, n); @@ -137,17 +136,37 @@ static int codepoint (lua_State *L) { s = utf8_decode(s, &code, !lax); if (s == NULL) return luaL_error(L, MSGInvalid); - lua_pushinteger(L, l_castU2S(code)); + lua_pushinteger(L, code); n++; } return n; } +#define UTF8BUFFSZ 8 + +// Taken from https://github.com/lua/lua/blob/c15543b9afa31ab5dc564511ae11acd808405e8f/lobject.c#L385-L400 +static int luaO_utf8esc(char *buff, l_uint32 x) { + int n = 1; /* number of bytes put in buffer (backwards) */ + if (x < 0x80) /* ascii? */ + buff[UTF8BUFFSZ - 1] = (char)(x); + else { /* need continuation bytes */ + unsigned int mfb = 0x3f; /* maximum that fits in first byte */ + do { /* add continuation bytes */ + buff[UTF8BUFFSZ - (n++)] = (char)(0x80 | (x & 0x3f)); + x >>= 6; /* remove added bits */ + mfb >>= 1; /* now there is one less bit available in first byte */ + } while (x > mfb); /* still needs continuation byte? */ + buff[UTF8BUFFSZ - n] = (char)((~mfb << 1) | x); /* add first byte */ + } + return n; +} static void pushutfchar (lua_State *L, int arg) { lua_Unsigned code = (lua_Unsigned)luaL_checkinteger(L, arg); luaL_argcheck(L, code <= MAXUTF, arg, "value out of range"); - lua_pushfstring(L, "%U", (long)code); + char bf[UTF8BUFFSZ]; + int len = luaO_utf8esc(bf, (l_uint32)code); + lua_pushlstring(L, &bf[UTF8BUFFSZ - len], len); } @@ -180,7 +199,7 @@ static int byteoffset (lua_State *L) { size_t len; const char *s = luaL_checklstring(L, 1, &len); lua_Integer n = luaL_checkinteger(L, 2); - lua_Integer posi = (n >= 0) ? 1 : cast_st2S(len) + 1; + lua_Integer posi = (n >= 0) ? 1 : (lua_Integer)(len + 1); posi = u_posrelat(luaL_optinteger(L, 3, posi), len); luaL_argcheck(L, 1 <= posi && --posi <= (lua_Integer)len, 3, "position out of bounds"); @@ -210,7 +229,7 @@ static int byteoffset (lua_State *L) { } } if (n != 0) { /* did not find given character? 
*/ - luaL_pushfail(L); + lua_pushnil(L); return 1; } lua_pushinteger(L, posi + 1); /* initial position */ @@ -239,8 +258,8 @@ static int iter_aux (lua_State *L, int strict) { const char *next = utf8_decode(s + n, &code, strict); if (next == NULL || iscontp(next)) return luaL_error(L, MSGInvalid); - lua_pushinteger(L, l_castU2S(n + 1)); - lua_pushinteger(L, l_castU2S(code)); + lua_pushinteger(L, (lua_Integer)(n + 1)); + lua_pushinteger(L, (lua_Integer)code); return 2; } } @@ -267,7 +286,7 @@ static int iter_codes (lua_State *L) { -/* pattern to match a single UTF-8 character */ -#define UTF8PATT "[\0-\x7F\xC2-\xFD][\x80-\xBF]*" +/* pattern to match a single UTF-8 character (Lua 5.1 pattern syntax: %z stands for NUL and, being a class, cannot be a range endpoint, so the ASCII range starts at \x01) */ +#define UTF8PATT "[%z\x01-\x7F\xC2-\xFD][\x80-\xBF]*" static const luaL_Reg funcs[] = { @@ -276,14 +295,12 @@ static const luaL_Reg funcs[] = { {"char", utfchar}, {"len", utflen}, {"codes", iter_codes}, - /* placeholders */ - {"charpattern", NULL}, {NULL, NULL} }; -LUAMOD_API int luaopen_utf8 (lua_State *L) { - luaL_newlib(L, funcs); +LUALIB_API int luaopen_utf8 (lua_State *L) { + luaL_register(L, LUA_UTF8LIBNAME, funcs); lua_pushlstring(L, UTF8PATT, sizeof(UTF8PATT)/sizeof(char) - 1); lua_setfield(L, -2, "charpattern"); return 1; diff --git a/lib/lutf8/lutf8.h b/lib/lutf8/lutf8.h new file mode 100644 index 000000000..5cff503e4 --- /dev/null +++ b/lib/lutf8/lutf8.h @@ -0,0 +1,6 @@ +#pragma once + +#include "lua.h" + +#define LUA_UTF8LIBNAME "utf8" +LUALIB_API int luaopen_utf8(lua_State *L); diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 1231f49ba..446209728 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -573,6 +573,7 @@ include_directories(SYSTEM ${GMP_INCLUDE_DIR} ${JSON_INCLUDE_DIR} ${LUA_BIT_INCLUDE_DIR} + ${LUA_UTF8_INCLUDE_DIR} # on Android, Luanti depends on SDL2 directly # on other platforms, only IrrlichtMt depends on SDL2 "$<$:${SDL2_INCLUDE_DIRS}>" @@ -695,6 +696,7 @@ if(BUILD_CLIENT) ${GMP_LIBRARY} ${JSON_LIBRARY} ${LUA_BIT_LIBRARY} + ${LUA_UTF8_LIBRARY} sha256 ${FREETYPE_LIBRARY} ${PLATFORM_LIBS} @@ -785,6 
+787,7 @@ if(BUILD_SERVER) ${JSON_LIBRARY} ${LUA_LIBRARY} ${LUA_BIT_LIBRARY} + ${LUA_UTF8_LIBRARY} sha256 ${GMP_LIBRARY} ${PLATFORM_LIBS} diff --git a/src/script/cpp_api/s_base.cpp b/src/script/cpp_api/s_base.cpp index ba931b22a..14fea146c 100644 --- a/src/script/cpp_api/s_base.cpp +++ b/src/script/cpp_api/s_base.cpp @@ -29,6 +29,7 @@ extern "C" { #else #include "bit.h" #endif +#include "lutf8.h" } #include @@ -84,6 +85,10 @@ ScriptApiBase::ScriptApiBase(ScriptingType type): lua_pushstring(m_luastack, LUA_BITLIBNAME); lua_call(m_luastack, 1, 0); + // Load utf8 library + lua_pushcfunction(m_luastack, luaopen_utf8); + lua_call(m_luastack, 0, 0); + #if BUILD_WITH_TRACY // Load tracy lua bindings tracy::LuaRegister(m_luastack); diff --git a/src/script/cpp_api/s_security.cpp b/src/script/cpp_api/s_security.cpp index 834650fdc..54e9f9ea9 100644 --- a/src/script/cpp_api/s_security.cpp +++ b/src/script/cpp_api/s_security.cpp @@ -3,6 +3,7 @@ // Copyright (C) 2013 celeron55, Perttu Ahola #include "cpp_api/s_security.h" +#include "log.h" #include "lua_api/l_base.h" #include "filesys.h" #include "porting.h" @@ -38,6 +39,7 @@ static void shallow_copy_table(lua_State *L, int from=-2, int to=-1) if (from < 0) from = lua_gettop(L) + from + 1; if (to < 0) to = lua_gettop(L) + to + 1; lua_pushnil(L); + assert(lua_istable(L, from)); while (lua_next(L, from) != 0) { assert(lua_type(L, -1) != LUA_TTABLE); // duplicate key and value for lua_rawset @@ -96,6 +98,7 @@ void ScriptApiSecurity::initializeSecurity() "table", "math", "bit", + "utf8", // Not sure if completely safe. But if someone enables tracy, they'll // know what they do. #if BUILD_WITH_TRACY @@ -296,6 +299,7 @@ void ScriptApiSecurity::initializeSecurityClient() "table", "math", "bit", + "utf8", // Not sure if completely safe. But if someone enables tracy, they'll // know what they do. #if BUILD_WITH_TRACY