Lua – 表的实现机制

表的实现机制
1. 表的结构
// Table的数据结构
// lobject.h
typedef struct Table {
    CommonHeader;           //宏:定义需要进行垃圾回收处理的Lua数据类型
    lu_byte flags;          //标识这个表有哪些元方法
    lu_byte lsizenode;      //数值是以2为底的对数值,并不是实际大小
    unsigned int sizearray; //数组的大小
    TValue *array;          //指向数组部分的指针
    Node *node;             //指向哈希表头部分的指针
    Node *lastfree;         //指向哈希表尾部分的指针
    struct Table *metatable;//指向该表的元表的指针
    GCObject *gclist;       //GC相关链表的指针
} Table;

从数据结构可以看到,Lua中table的储存分为数组和哈希表部分。

数组部分保存的是非负整数的键,实际大小是n(原则是数组中1-n的利用率超过50%)。数组部分从1开始索引,可以提供紧凑高效的随机访问。
哈希表部分存放不能被储存到数组部分的数据,唯一的限制是key不能为nil。

2. 表的新建与释
//ltable.c
Table *luaH_new (lua_State *L) {
    GCObject *o = luaC_newobj(L, LUA_TTABLE, sizeof(Table));
    Table *t = gco2t(o);
    t->metatable = NULL;
    t->flags = cast_byte(~0);
    t->array = NULL;
    t->sizearray = 0;
    setnodevector(L, t, 0);
    return t;
}


void luaH_free (lua_State *L, Table *t) {
    if (!isdummy(t))
        luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
    luaM_freearray(L, t->array, t->sizearray);
    luaM_free(L, t);
}

flags:最开始这个flags是1,在查找过某个元方法后,如果存在就将这个元方法对应的位置为0,下次直接比较就行。

isdummy(t):为了减少空表的维护成本,Lua定义了一个静态常量表,在初始化的时候让node指向这个dummynode。

//ltable.c
#define dummynode       (&dummynode_)

static const Node dummynode_ = {
    {NILCONSTANT},    /* value */
    {{NILCONSTANT, 0}}    /* key */
};

判断哈希表是否为空:

//ltable.h
#define isdummy(t)      ((t)->lastfree == NULL)
3. 关于读写

3.1 读

根据键的类型有不同的查询函数。如果是正整数类型,若大小在数组size内,则返回数组对应值;否则计算出这个键对应的哈希值,根据哈希值找到散列桶位置,遍历该散列桶下的所有对象,直到找到相同的键再返回值。

//ltable.c
/*
** main search function
*/
const TValue *luaH_get (Table *t, const TValue *key) {
    switch (ttype(key)) {
        case LUA_TSHRSTR: return luaH_getshortstr(t, tsvalue(key));
        case LUA_TNUMINT: return luaH_getint(t, ivalue(key));
        case LUA_TNIL: return luaO_nilobject;
        case LUA_TNUMFLT: {
            lua_Integer k;
            if (luaV_tointeger(key, &k, 0)) /* index is int? */
                return luaH_getint(t, k);    /* use specialized version */
            /* else... */
        }    /* FALLTHROUGH */
        default:
            return getgeneric(t, key);
    }
}

/*
** search function for integers
*/
const TValue *luaH_getint (Table *t, lua_Integer key) {
    /* (1 <= key && key <= t->sizearray) */
    if (l_castS2U(key) - 1 < t->sizearray)
        return &t->array[key - 1];
    else {
        Node *n = hashint(t, key);
        for (;;) {    /* check whether 'key' is somewhere in the chain */
            if (ttisinteger(gkey(n)) && ivalue(gkey(n)) == key)
                return gval(n);    /* that's it */
            else {
                int nx = gnext(n);
                if (nx == 0) break;
                n += nx;
            }
        }
        return luaO_nilobject;
    }
}

/*
** "Generic" get version. (Not that generic: not valid for integers,
** which may be in array part, nor for floats with integral values.)
*/
static const TValue *getgeneric (Table *t, const TValue *key) {
    Node *n = mainposition(t, key);
    for (;;) {    /* check whether 'key' is somewhere in the chain */
        if (luaV_rawequalobj(gkey(n), key))
            return gval(n);    /* that's it */
        else {
            int nx = gnext(n);
            if (nx == 0)
                return luaO_nilobject;    /* not found */
            n += nx;
        }
    }
}

3.2 写

先查询是否已有该键,如果有的话直接赋值;如果没有的话,则创建新的键,再赋值。创建新键的过程中可能会有重新分配数组和哈希表的操作。

//ltable.c
TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
    const TValue *p = luaH_get(t, key);
    if (p != luaO_nilobject)
        return cast(TValue *, p);
    else return luaH_newkey(L, t, key);
}

3.2.1 关于新建一个键

Lua的哈希表部分从根本上来说是由一组链表组成的。每个链表可以看做是一个桶,将所有的元素通过散列的方式放到不同的桶中。插入元素时,先将键传入一个哈希函数,得到这个键对应的桶的位置(mainposition,相同mainposition的数据以链表的形式组织),如果这mainposition没有被占,则可以直接赋值,否则要找到一个空闲节点,将其插入链表中。

//ltable.c
TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
    Node *mp;
    TValue aux;
    if (ttisnil(key)) luaG_runerror(L, "table index is nil");
    else if (ttisfloat(key)) {
        lua_Integer k;
        if (luaV_tointeger(key, &k, 0)) {    /* does index fit in an integer? */
            setivalue(&aux, k);
            key = &aux;    /* insert it as an integer */
        }
        else if (luai_numisnan(fltvalue(key)))
            luaG_runerror(L, "table index is NaN");
    }
    mp = mainposition(t, key);
    if (!ttisnil(gval(mp)) || isdummy(t)) {    /* main position is taken? */
        Node *othern;
        Node *f = getfreepos(t);    /* get a free place */
        if (f == NULL) {    /* cannot find a free place? */
            rehash(L, t, key);    /* grow table */
            /* whatever called 'newkey' takes care of TM cache */
            return luaH_set(L, t, key);    /* insert key into grown table */
        }
        lua_assert(!isdummy(t));
        othern = mainposition(t, gkey(mp));
        if (othern != mp) {    /* is colliding node out of its main position? */
            /* yes; move colliding node into free position */
            while (othern + gnext(othern) != mp)    /* find previous */
                othern += gnext(othern);
            gnext(othern) = cast_int(f - othern);    /* rechain to point to 'f' */
            *f = *mp;    /* copy colliding node into free pos. (mp->next also goes) */
            if (gnext(mp) != 0) {
                gnext(f) += cast_int(mp - f);    /* correct 'next' */
                gnext(mp) = 0;    /* now 'mp' is free */
            }
            setnilvalue(gval(mp));
        }
        else {    /* colliding node is in its own main position */
            /* new node will go into free position */
            if (gnext(mp) != 0)
                gnext(f) = cast_int((mp + gnext(mp)) - f);    /* chain new position */
            else lua_assert(gnext(f) == 0);
            gnext(mp) = cast_int(f - mp);
            mp = f;
        }
    }
    setnodekey(L, &mp->i_key, key);
    luaC_barrierback(L, t, key);
    lua_assert(ttisnil(gval(mp)));
    return gval(mp);
}

3.2.2 关于mainposition

对于不同类型的key有不同的哈希函数,保证均匀散列,哈希表的效率高一些。

3.2.3 关于getfreepos

当mainpostion被占时,需要在哈希表中找到一个空闲可用的节点,是通过递减lastfree域来实现的。然后lua在设置键位的值为nil时,也不会立即回收节点,而是在lastfree域递减到头时,做一次rehash操作。

//ltable.c
static Node *getfreepos (Table *t) {
    if (!isdummy(t)) {
        while (t->lastfree > t->node) {
            t->lastfree--;
            if (ttisnil(gkey(t->lastfree)))
                return t->lastfree;
        }
    }
    return NULL;    /* could not find a free place */
}

3.2.4 关于rehash

当newkey时发现没有空闲节点可用了,就会触发rehash操作。rehash做的事情主要是统计数组和哈希表中的有效键值对,重新分配数组和哈希表的空间大小。
原则是:数组部分的利用率要超过50%

//ltable.c
/*
** nums[i] = number of keys 'k' where 2^(i - 1) < k <= 2^i
*/
static void rehash (lua_State *L, Table *t, const TValue *ek) {
    unsigned int asize;    /* optimal size for array part */
    unsigned int na;    /* number of keys in the array part */
    unsigned int nums[MAXABITS + 1];
    int i;
    int totaluse;
    for (i = 0; i <= MAXABITS; i++) nums[i] = 0;    /* reset counts */
    na = numusearray(t, nums);    /* count keys in array part */
    totaluse = na;    /* all those keys are integer keys */
    totaluse += numusehash(t, nums, &na);    /* count keys in hash part */
    /* count extra key */
    na += countint(ek, nums);
    totaluse++;
    /* compute new size for array part */
    asize = computesizes(nums, &na);
    /* resize the table to new computed sizes */
    luaH_resize(L, t, asize, totaluse - na);
}

nums这个数组用来统计整数键值对。num[i]记录了从2^(i – 1)到2^i的键值对的数量。
统计完数组和哈希表中的整数键值对之后,会调用computesizes函数计算新的数组大小。
最后再调用resize函数重新分配数组和哈希表大小,并把不能存在数组里的键值对放入哈希表中。

3.2.5 关于numusearray,numusehash,countint

//ltable.c
static unsigned int numusearray (const Table *t, unsigned int *nums) {
    int lg;
    unsigned int ttlg;    /* 2^lg */
    unsigned int ause = 0;    /* summation of 'nums' */
    unsigned int i = 1;    /* count to traverse all array keys */
    /* traverse each slice */
    for (lg = 0, ttlg = 1; lg <= MAXABITS; lg++, ttlg *= 2) {
        unsigned int lc = 0;    /* counter */
        unsigned int lim = ttlg;
        if (lim > t->sizearray) {
            lim = t->sizearray;    /* adjust upper limit */
            if (i > lim)
                break;    /* no more elements to count */
        }
        /* count elements in range (2^(lg - 1), 2^lg] */
        for (; i <= lim; i++) {
            if (!ttisnil(&t->array[i-1]))
                lc++;
        }
        nums[lg] += lc;
        ause += lc;
    }
    return ause;
}

//在nums数组中统计哈希表中正整数键的键值对
static int numusehash (const Table *t, unsigned int *nums, unsigned int *pna) {
    int totaluse = 0;    /* total number of elements */
    int ause = 0;    /* elements added to 'nums' (can go to array part) */
    int i = sizenode(t);
    while (i--) {
        Node *n = &t->node[i];
        if (!ttisnil(gval(n))) {
            ause += countint(gkey(n), nums);
            totaluse++;
        }
    }
    *pna += ause;
    return totaluse;
}


//判断键是否是正整数键
static int countint (const TValue *key, unsigned int *nums) {
    unsigned int k = arrayindex(key);
    if (k != 0) {    /* is 'key' an appropriate array index? */
        nums[luaO_ceillog2(k)]++;    /* count as such */
        return 1;
    }
    else
        return 0;
}

3.2.6 关于computesizes

//ltable.c
static unsigned int computesizes (unsigned int nums[], unsigned int *pna) {
    int i;
    unsigned int twotoi;    /* 2^i (candidate for optimal size) */
    unsigned int a = 0;    /* number of elements smaller than 2^i */
    unsigned int na = 0;    /* number of elements to go to array part */
    unsigned int optimal = 0;    /* optimal size for array part */
    /* loop while keys can fill more than half of total size */
    for (i = 0, twotoi = 1;
             twotoi > 0 && *pna > twotoi / 2;
             i++, twotoi *= 2) {
        if (nums[i] > 0) {
            a += nums[i];
            if (a > twotoi/2) {    /* more than half elements present? */
                optimal = twotoi;    /* optimal size (till now) */
                na = a;    /* all elements up to 'optimal' will go to array part */
            }
        }
    }
    lua_assert((optimal == 0 || optimal / 2 < na) && na <= optimal);
    *pna = na;
    return optimal;
}

遍历之前nums统计的所有键值对数量,看看每个nums[i]区间内的使用率是否大于50%,找到这个区间的最大值,对应的目前的键值对数量就是目前lua表数组部分的最优大小。

3.2.7 关于luaH_resize

//ltable.c
void luaH_resize (lua_State *L, Table *t, unsigned int nasize,unsigned int nhsize) {
    unsigned int i;
    int j;
    AuxsetnodeT asn;
    unsigned int oldasize = t->sizearray;
    int oldhsize = allocsizenode(t);
    Node *nold = t->node;    /* save old hash ... */
    if (nasize > oldasize)    /* array part must grow? */
        setarrayvector(L, t, nasize);
    /* create new hash part with appropriate size */
    asn.t = t; asn.nhsize = nhsize;
    if (luaD_rawrunprotected(L, auxsetnode, &asn) != LUA_OK) {    /* mem. error? */
        setarrayvector(L, t, oldasize);    /* array back to its original size */
        luaD_throw(L, LUA_ERRMEM);    /* rethrow memory error */
    }
    if (nasize < oldasize) {    /* array part must shrink? */
        t->sizearray = nasize;
        /* re-insert elements from vanishing slice */
        for (i=nasize; i<oldasize; i++) {
            if (!ttisnil(&t->array[i]))
                luaH_setint(L, t, i + 1, &t->array[i]);
        }
        /* shrink array */
        luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
    }
    /* re-insert elements from hash part */
    for (j = oldhsize - 1; j >= 0; j--) {
        Node *old = nold + j;
        if (!ttisnil(gval(old))) {
            /* doesn't need barrier/invalidate cache, as entry was
                 already present in the table */
            setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
        }
    }
    if (oldhsize > 0)    /* not the dummy node? */
        luaM_freearray(L, nold, cast(size_t, oldhsize)); /* free old hash */
}
4. 关于取长度

lua中使用#符号对表进行取长度操作。

//ltable.c
/*
** Try to find a boundary in table 't'. A 'boundary' is an integer index
** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
*/
lua_Unsigned luaH_getn (Table *t) {
    unsigned int j = t->sizearray;
    if (j > 0 && ttisnil(&t->array[j - 1])) {
        /* there is a boundary in the array part: (binary) search for it */
        unsigned int i = 0;
        while (j - i > 1) {
            unsigned int m = (i+j)/2;
            if (ttisnil(&t->array[m - 1])) j = m;
            else i = m;
        }
        return i;
    }
    /* else must find a boundary in hash part */
    else if (isdummy(t))    /* hash part is empty? */
        return j;    /* that is easy... */
    else return unbound_search(t, j);
}

先判断数组部分最后一个是否为nil,如果是nil,则使用二分法在数组部分寻找下一个为nil 的节点,确认长度;否则先判断哈希表部分是否为空,若为空则长度为数组部分;否则在哈希表中寻找边界,也是二分法。

//ltable.c
static lua_Unsigned unbound_search (Table *t, lua_Unsigned j) {
    lua_Unsigned i = j;    /* i is zero or a present index */
    j++;
    /* find 'i' and 'j' such that i is present and j is not */
    while (!ttisnil(luaH_getint(t, j))) {
        i = j;
        if (j > l_castS2U(LUA_MAXINTEGER) / 2) {    /* overflow? */
            /* table was built with bad purposes: resort to linear search */
            i = 1;
            while (!ttisnil(luaH_getint(t, i))) i++;
            return i - 1;
        }
        j *= 2;
    }
    /* now do a binary search between them */
    while (j - i > 1) {
        lua_Unsigned m = (i+j)/2;
        if (ttisnil(luaH_getint(t, m))) j = m;
        else i = m;
    }
    return i;
}
5. 关于迭代

lua表为了兼容数组和哈希表部分的访问,迭代操作传入的是一个key,返回这个key的下一个数据。对外的API是luaH_next函数。

//ltable.c
int luaH_next (lua_State *L, Table *t, StkId key) {
    unsigned int i = findindex(L, t, key);    /* find original element */
    for (; i < t->sizearray; i++) {    /* try first array part */
        if (!ttisnil(&t->array[i])) {    /* a non-nil value? */
            setivalue(key, i + 1);
            setobj2s(L, key+1, &t->array[i]);
            return 1;
        }
    }
    for (i -= t->sizearray; cast_int(i) < sizenode(t); i++) {    /* hash part */
        if (!ttisnil(gval(gnode(t, i)))) {    /* a non-nil value? */
            setobj2s(L, key, gkey(gnode(t, i)));
            setobj2s(L, key+1, gval(gnode(t, i)));
            return 1;
        }
    }
    return 0;    /* no more elements */
}
7. 注意事项
  1. lua表的rehash操作代价很大,所以要尽量避免这个操作,可以进行预分配来规避问题,提升效率。
local array = {}
array[1] = 1
array[2] = 2
array[3] = 3

//如果需要创建很多小表的时候,上面操作可以换成,减少解释器rehash

local array = {1,2,3}
array[1] = 1
array[2] = 2
array[3] = 3

  1. 尽量不要混用数组和哈希表部分,一个table只管理一类型的数据。
8. 参考
  1. 数据结构 哈希表(HashTable)_哈希概述表
  2. 《Lua设计与实现》
  3. 《Lua源码欣赏》云风
Tags:

Add a Comment

您的电子邮箱地址不会被公开。 必填项已用*标注