HEX
Server: Apache/2.4.41 (Ubuntu)
System: Linux ip-172-31-42-149 5.15.0-1084-aws #91~20.04.1-Ubuntu SMP Fri May 2 07:00:04 UTC 2025 aarch64
User: ubuntu (1000)
PHP: 7.4.33
Disabled: pcntl_alarm,pcntl_fork,pcntl_waitpid,pcntl_wait,pcntl_wifexited,pcntl_wifstopped,pcntl_wifsignaled,pcntl_wifcontinued,pcntl_wexitstatus,pcntl_wtermsig,pcntl_wstopsig,pcntl_signal,pcntl_signal_get_handler,pcntl_signal_dispatch,pcntl_get_last_error,pcntl_strerror,pcntl_sigprocmask,pcntl_sigwaitinfo,pcntl_sigtimedwait,pcntl_exec,pcntl_getpriority,pcntl_setpriority,pcntl_async_signals,pcntl_unshare,
Upload Files
File: //home/ubuntu/neovim/src/nvim/generators/gen_unicode_tables.lua
-- Script creates the following tables in unicode_tables.generated.h:
--
-- 1. doublewidth and ambiguous tables: sorted list of non-overlapping closed
--    intervals. Codepoints in these intervals have double (W or F) or ambiguous
--    (A) east asian width respectively.
-- 2. combining table: same as the above, but characters inside are combining
--    characters (i.e. have general categories equal to Mn or Me; Mc is left
--    out because such characters do take up space).
-- 3. foldCase table used to convert characters to
--    folded variants. In this table first two values are
--    character ranges: like in previous tables they are sorted and must be
--    non-overlapping. Third value means step inside the range: e.g. if it is
--    2 then interval applies only to first, third, fifth, … character in range.
--    Fourth value is number that should be added to the codepoint to yield
--    folded codepoint.
-- 4. emoji_wide and emoji_all tables: sorted lists of non-overlapping closed
--    intervals of Emoji characters.  emoji_all has all Emojis; emoji_wide only
--    has those at or above U+1F000 that are not already covered by the
--    ambiguous or doublewidth tables.
-- Command-line handling. `--help` prints the usage text and exits
-- successfully. A missing input directory or output file name is reported
-- with the same usage text on stderr and a failing exit status, instead of
-- surfacing later as a nil-concatenation or io.open argument error.
do
  local usage = 'Usage:\n  gen_unicode_tables.lua unicode/ unicode_tables.generated.h'
  if arg[1] == '--help' then
    print(usage)
    os.exit(0)
  elseif arg[1] == nil or arg[2] == nil then
    io.stderr:write(usage .. '\n')
    os.exit(1)
  end
end

-- Directory holding the Unicode data files (first command-line argument).
local basedir = arg[1]
-- The first character of package.config is the directory separator.
local pathsep = string.sub(package.config, 1, 1)

--- Returns the path of `fname` located directly inside `basedir`.
-- @param fname file name relative to `basedir`
-- @return `basedir`, the directory separator and `fname` joined together
local function get_path(fname)
  local dir_prefix = basedir .. pathsep
  return dir_prefix .. fname
end

-- Output header to generate (second command-line argument).
local utf_tables_fname = arg[2]

-- Input data files, all looked up directly inside basedir.
local emoji_fname = get_path('emoji-data.txt')
local eastasianwidth_fname = get_path('EastAsianWidth.txt')
local unicodedata_fname = get_path('UnicodeData.txt')

--- Splits `s` into its semicolon-separated fields.
-- Leading and trailing whitespace is stripped from every field. Empty fields
-- are kept, so the result always holds one more entry than `s` has
-- semicolons.
-- @param s string to split
-- @return list of trimmed field strings
local split_on_semicolons = function(s)
  local fields = {}
  -- Appending a terminator lets a single pattern consume every field,
  -- including a trailing empty one.
  for raw in (s .. ';'):gmatch('([^;]*);') do
    fields[#fields + 1] = raw:match('^%s*(.-)%s*$')
  end
  return fields
end

--- Reads `fp` line by line and splits each line on semicolons.
-- @param fp open file handle; read until end of file
-- @param n number of fields every parsed line must have
-- @param has_comments when true, lines starting with '#' and lines made up
--   only of whitespace are skipped instead of parsed
-- @return list with one field list per parsed line, or nil after reporting
--   on stderr the first line whose field count differs from `n`
local fp_lines_to_lists = function(fp, n, has_comments)
  local rows = {}
  local lineno = 0
  local line = fp:read('*l')
  while line do
    -- Skipped lines are counted too, so reported numbers match the file.
    lineno = lineno + 1
    local skip = has_comments and (line:sub(1, 1) == '#' or line:match('^%s*$'))
    if not skip then
      local fields = split_on_semicolons(line)
      if #fields ~= n then
        io.stderr:write(('Found %s items in line %u, expected %u\n'):format(#fields, lineno, n))
        io.stderr:write('Line: ' .. line .. '\n')
        return nil
      end
      table.insert(rows, fields)
    end
    line = fp:read('*l')
  end
  return rows
end

--- Parses the UnicodeData.txt handle into a list of field lists.
-- Every line is parsed (no comment filtering) and must have 15 fields.
-- @param ud_fp open file handle
-- @return list of 15-item field lists, or nil on a malformed line
local function parse_data_to_props(ud_fp)
  local field_count = 15
  local has_comments = false
  return fp_lines_to_lists(ud_fp, field_count, has_comments)
end

--- Parses the EastAsianWidth.txt handle into a list of field lists.
-- Comment and blank lines are skipped; each remaining line must have 2 fields.
-- @param eaw_fp open file handle
-- @return list of 2-item field lists, or nil on a malformed line
local function parse_width_props(eaw_fp)
  local field_count = 2
  local has_comments = true
  return fp_lines_to_lists(eaw_fp, field_count, has_comments)
end

--- Parses the emoji-data.txt handle into a list of field lists.
-- Comment and blank lines are skipped; each remaining line must have 2 fields.
-- @param emoji_fp open file handle
-- @return list of 2-item field lists, or nil on a malformed line
local function parse_emoji_props(emoji_fp)
  local field_count = 2
  local has_comments = true
  return fp_lines_to_lists(emoji_fp, field_count, has_comments)
end

--- Formats one C initializer line for an interval table.
-- With both `step` and `add` given the line has four fields and a step of 0
-- is written as -1; otherwise only the two bounds are written, zero-padded
-- to at least four hex digits.
-- @param start first codepoint of the interval
-- @param end_ last codepoint of the interval
-- @param step optional step inside the interval
-- @param add optional offset written as the fourth field
-- @return the formatted line, including the trailing newline
local make_range = function(start, end_, step, add)
  if not (step and add) then
    return string.format('  {0x%04x, 0x%04x},\n', start, end_)
  end
  local emitted_step = step
  if step == 0 then
    emitted_step = -1
  end
  return string.format('  {0x%x, 0x%x, %d, %d},\n', start, end_, emitted_step, add)
end

--- Writes the `combining` interval table to `ut_fp`.
-- Consecutive codepoints whose general category is Mn or Me are merged into
-- closed intervals. Mc is deliberately not included because such characters
-- do take up space.
-- @param ut_fp output file handle
-- @param dataprops parsed UnicodeData lines (field 1: hex codepoint,
--   field 3: general category)
local build_combining_table = function(ut_fp, dataprops)
  local is_combining = { Mn = true, Me = true }
  -- Bounds of the run being collected; -1 means no run has been started yet.
  local run_first = -1
  local run_last = -1

  -- Writes the pending run, if there is one.
  local function flush_run()
    if run_first >= 0 then
      ut_fp:write(make_range(run_first, run_last))
    end
  end

  ut_fp:write('static const struct interval combining[] = {\n')
  for _, props in ipairs(dataprops) do
    if is_combining[props[3]] then
      local cp = tonumber(props[1], 16)
      local extends_run = run_first >= 0 and run_last + 1 == cp
      if not extends_run then
        -- Gap found: emit what was collected and start over at this codepoint.
        flush_run()
        run_first = cp
      end
      run_last = cp
    end
  end
  flush_run()
  ut_fp:write('};\n')
end

--- Writes an interval table named `table_name` to `ut_fp` for the entries of
-- `widthprops` whose width class is enabled in `widths`.
-- Adjacent entries are merged into one interval. A single codepoint is left
-- out when `dataprops` marks it as composing (Mn, Mc or Me); ranges are
-- always taken as a whole. The lookup cursor into `dataprops` only moves
-- forward, so both lists are assumed to be in ascending codepoint order.
-- @param ut_fp output file handle
-- @param dataprops parsed UnicodeData lines
-- @param widthprops parsed EastAsianWidth lines (field 1: codepoint or
--   `first..last` range, field 2: starts with the width class letter)
-- @param widths set of accepted width class letters
-- @param table_name name of the generated C array
-- @return list of the `{first, last}` intervals that were written
local build_width_table = function(ut_fp, dataprops, widthprops, widths, table_name)
  local composing = { Mn = true, Mc = true, Me = true }
  local intervals = {}
  -- Bounds of the interval being collected; -1 means none started yet.
  local first = -1
  local last = -1
  local cursor = 1

  -- Writes and records the pending interval, if there is one.
  local function flush()
    if first >= 0 then
      ut_fp:write(make_range(first, last))
      table.insert(intervals, { first, last })
    end
  end

  ut_fp:write('static const struct interval ' .. table_name .. '[] = {\n')
  for _, wp in ipairs(widthprops) do
    if widths[wp[2]:sub(1, 1)] then
      local lo, hi
      local dots_at, dots_end = wp[1]:find('%.%.')
      if dots_at then
        -- A range: no composing check is done for it.
        lo = tonumber(wp[1]:sub(1, dots_at - 1), 16)
        hi = tonumber(wp[1]:sub(dots_end + 1), 16)
      else
        lo = tonumber(wp[1], 16)
        hi = lo
      end

      -- Advance to the first data entry at or past the start codepoint.
      local data_cp = tonumber(dataprops[cursor][1], 16)
      while data_cp < lo do
        cursor = cursor + 1
        data_cp = tonumber(dataprops[cursor][1], 16)
      end
      if data_cp ~= lo and hi == lo then
        io.stderr:write('Cannot find character ' .. lo .. ' in data table.\n')
      end

      local is_range = hi > lo
      if is_range or not composing[dataprops[cursor][3]] then
        local extends = first >= 0 and last + 1 == lo
        if not extends then
          flush()
          first = lo
        end
        last = hi
      end
    end
  end
  flush()
  ut_fp:write('};\n')
  return intervals
end

--- Writes the `emoji_all` and `emoji_wide` interval tables to `ut_fp`.
-- `emoji_all` gets every entry whose property field matches `Emoji%s+#`.
-- `emoji_wide` gets only entries starting at or above 0x1f000, after their
-- ends have been clipped against the ambiguous and doublewidth intervals.
-- NOTE: only the two ends of an entry are clipped; an interval lying
-- strictly inside an emoji range does not split that range.
-- @param ut_fp output file handle
-- @param emojiprops parsed emoji-data lines (field 1: codepoint or
--   `first..last` range, field 2: property name followed by a comment)
-- @param doublewidth list of `{first, last}` double-width intervals
-- @param ambiwidth list of `{first, last}` ambiguous-width intervals
local build_emoji_table = function(ut_fp, emojiprops, doublewidth, ambiwidth)
  -- Appends {first, last} to `list`, or extends its final interval when the
  -- new one starts right after it.
  local function add_interval(list, first, last)
    local tail = list[#list]
    if tail and first - 1 == tail[2] then
      tail[2] = last
    else
      table.insert(list, { first, last })
    end
  end

  -- Moves `first` past, and `last` before, every interval containing it.
  local function clip(first, last, blocked)
    for _, iv in ipairs(blocked) do
      if first >= iv[1] and first <= iv[2] then
        first = iv[2] + 1
      end
      if last >= iv[1] and last <= iv[2] then
        last = iv[1] - 1
      end
    end
    return first, last
  end

  -- Writes `list` as a C array of intervals called `name`.
  local function write_table(name, list)
    ut_fp:write('static const struct interval ' .. name .. '[] = {\n')
    for _, iv in ipairs(list) do
      ut_fp:write(make_range(iv[1], iv[2]))
    end
    ut_fp:write('};\n')
  end

  local all = {}
  local wide = {}
  for _, prop in ipairs(emojiprops) do
    if prop[2]:match('Emoji%s+#') then
      local first, last
      local dots_at, dots_end = prop[1]:find('%.%.')
      if dots_at then
        first = tonumber(prop[1]:sub(1, dots_at - 1), 16)
        last = tonumber(prop[1]:sub(dots_end + 1), 16)
      else
        first = tonumber(prop[1], 16)
        last = first
      end
      add_interval(all, first, last)

      -- Characters below 1F000 may be considered single width traditionally,
      -- making them double width causes problems.
      if first >= 0x1f000 then
        -- Exclude characters that are in the ambiguous/doublewidth tables.
        first, last = clip(first, last, ambiwidth)
        first, last = clip(first, last, doublewidth)
        if first <= last then
          add_interval(wide, first, last)
        end
      end
    end
  end

  write_table('emoji_all', all)
  write_table('emoji_wide', wide)
end

-- Driver: parse every input file first, then write all tables to the output
-- header in the order combining, doublewidth, ambiguous, emoji_all,
-- emoji_wide. Failures to open or parse a file are raised through assert()
-- so the error names the offending file, and the output file is only
-- created once all inputs have been read successfully.

--- Opens `fname` for reading, runs `parse` on the handle and closes it.
-- @param fname path of the input file
-- @param parse function taking the open handle and returning the parsed
--   lines, or nil when a line is malformed
-- @return the parsed lines; raises an error if opening or parsing failed
local load_props = function(fname, parse)
  -- io.open returns nil plus a message on failure; assert turns it into an error.
  local fp = assert(io.open(fname, 'r'))
  local props = parse(fp)
  fp:close()
  return assert(props, 'Failed to parse ' .. fname)
end

local dataprops = load_props(unicodedata_fname, parse_data_to_props)
local widthprops = load_props(eastasianwidth_fname, parse_width_props)
local emojiprops = load_props(emoji_fname, parse_emoji_props)

local ut_fp = assert(io.open(utf_tables_fname, 'w'))

build_combining_table(ut_fp, dataprops)

local doublewidth =
  build_width_table(ut_fp, dataprops, widthprops, { W = true, F = true }, 'doublewidth')
local ambiwidth = build_width_table(ut_fp, dataprops, widthprops, { A = true }, 'ambiguous')

build_emoji_table(ut_fp, emojiprops, doublewidth, ambiwidth)

ut_fp:close()