Lua – 表的实现机制
表的实现机制
1. 表的结构
// Table的数据结构
// lobject.h
typedef struct Table {
CommonHeader; //宏:定义需要进行垃圾回收处理的Lua数据类型
lu_byte flags; //标识这个表有哪些元方法
lu_byte lsizenode; //数值是以2为底的对数值,并不是实际大小
unsigned int sizearray; //数组的大小
TValue *array; //指向数组部分的指针
Node *node; //指向哈希表头部分的指针
Node *lastfree; //指向哈希表尾部分的指针
struct Table *metatable;//指向该表的元表的指针
GCObject *gclist; //GC相关链表的指针
} Table;
从数据结构可以看到,Lua中table的储存分为数组和哈希表部分。
数组部分保存的是非负整数的键,实际大小是n(原则是数组中1-n的利用率超过50%)。数组部分从1开始索引,可以提供紧凑高效的随机访问。
哈希表部分存放不能被储存到数组部分的数据,唯一的限制是key不能为nil。
2. 表的新建与释
//ltable.c
Table *luaH_new (lua_State *L) {
GCObject *o = luaC_newobj(L, LUA_TTABLE, sizeof(Table));
Table *t = gco2t(o);
t->metatable = NULL;
t->flags = cast_byte(~0);
t->array = NULL;
t->sizearray = 0;
setnodevector(L, t, 0);
return t;
}
void luaH_free (lua_State *L, Table *t) {
if (!isdummy(t))
luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
luaM_freearray(L, t->array, t->sizearray);
luaM_free(L, t);
}
flags:最开始这个flags是1,在查找过某个元方法后,如果存在就将这个元方法对应的位置为0,下次直接比较就行。
isdummy(t):为了减少空表的维护成本,Lua定义了一个静态常量表,在初始化的时候让node指向这个dummynode。
//ltable.c
#define dummynode (&dummynode_)
static const Node dummynode_ = {
{NILCONSTANT}, /* value */
{{NILCONSTANT, 0}} /* key */
};
判断哈希表是否为空:
//ltable.h
#define isdummy(t) ((t)->lastfree == NULL)
3. 关于读写
3.1 读
根据键的类型有不同的查询函数。如果是正整数类型,若大小在数组size内,则返回数组对应值;否则计算出这个键对应的哈希值,根据哈希值找到散列桶位置,遍历该散列桶下的所有对象,直到找到相同的键再返回值。
//ltable.c
/*
** main search function
*/
const TValue *luaH_get (Table *t, const TValue *key) {
switch (ttype(key)) {
case LUA_TSHRSTR: return luaH_getshortstr(t, tsvalue(key));
case LUA_TNUMINT: return luaH_getint(t, ivalue(key));
case LUA_TNIL: return luaO_nilobject;
case LUA_TNUMFLT: {
lua_Integer k;
if (luaV_tointeger(key, &k, 0)) /* index is int? */
return luaH_getint(t, k); /* use specialized version */
/* else... */
} /* FALLTHROUGH */
default:
return getgeneric(t, key);
}
}
/*
** search function for integers
*/
const TValue *luaH_getint (Table *t, lua_Integer key) {
/* (1 <= key && key <= t->sizearray) */
if (l_castS2U(key) - 1 < t->sizearray)
return &t->array[key - 1];
else {
Node *n = hashint(t, key);
for (;;) { /* check whether 'key' is somewhere in the chain */
if (ttisinteger(gkey(n)) && ivalue(gkey(n)) == key)
return gval(n); /* that's it */
else {
int nx = gnext(n);
if (nx == 0) break;
n += nx;
}
}
return luaO_nilobject;
}
}
/*
** "Generic" get version. (Not that generic: not valid for integers,
** which may be in array part, nor for floats with integral values.)
*/
static const TValue *getgeneric (Table *t, const TValue *key) {
Node *n = mainposition(t, key);
for (;;) { /* check whether 'key' is somewhere in the chain */
if (luaV_rawequalobj(gkey(n), key))
return gval(n); /* that's it */
else {
int nx = gnext(n);
if (nx == 0)
return luaO_nilobject; /* not found */
n += nx;
}
}
}
3.2 写
先查询是否已有该键,如果有的话直接赋值;如果没有的话,则创建新的键,再赋值。创建新键的过程中可能会有重新分配数组和哈希表的操作。
//ltable.c
TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
const TValue *p = luaH_get(t, key);
if (p != luaO_nilobject)
return cast(TValue *, p);
else return luaH_newkey(L, t, key);
}
3.2.1 关于新建一个键
Lua的哈希表部分从根本上来说是由一组链表组成的。每个链表可以看做是一个桶,将所有的元素通过散列的方式放到不同的桶中。插入元素时,先将键传入一个哈希函数,得到这个键对应的桶的位置(mainposition,相同mainposition的数据以链表的形式组织),如果这mainposition没有被占,则可以直接赋值,否则要找到一个空闲节点,将其插入链表中。
//ltable.c
TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
Node *mp;
TValue aux;
if (ttisnil(key)) luaG_runerror(L, "table index is nil");
else if (ttisfloat(key)) {
lua_Integer k;
if (luaV_tointeger(key, &k, 0)) { /* does index fit in an integer? */
setivalue(&aux, k);
key = &aux; /* insert it as an integer */
}
else if (luai_numisnan(fltvalue(key)))
luaG_runerror(L, "table index is NaN");
}
mp = mainposition(t, key);
if (!ttisnil(gval(mp)) || isdummy(t)) { /* main position is taken? */
Node *othern;
Node *f = getfreepos(t); /* get a free place */
if (f == NULL) { /* cannot find a free place? */
rehash(L, t, key); /* grow table */
/* whatever called 'newkey' takes care of TM cache */
return luaH_set(L, t, key); /* insert key into grown table */
}
lua_assert(!isdummy(t));
othern = mainposition(t, gkey(mp));
if (othern != mp) { /* is colliding node out of its main position? */
/* yes; move colliding node into free position */
while (othern + gnext(othern) != mp) /* find previous */
othern += gnext(othern);
gnext(othern) = cast_int(f - othern); /* rechain to point to 'f' */
*f = *mp; /* copy colliding node into free pos. (mp->next also goes) */
if (gnext(mp) != 0) {
gnext(f) += cast_int(mp - f); /* correct 'next' */
gnext(mp) = 0; /* now 'mp' is free */
}
setnilvalue(gval(mp));
}
else { /* colliding node is in its own main position */
/* new node will go into free position */
if (gnext(mp) != 0)
gnext(f) = cast_int((mp + gnext(mp)) - f); /* chain new position */
else lua_assert(gnext(f) == 0);
gnext(mp) = cast_int(f - mp);
mp = f;
}
}
setnodekey(L, &mp->i_key, key);
luaC_barrierback(L, t, key);
lua_assert(ttisnil(gval(mp)));
return gval(mp);
}
3.2.2 关于mainposition
对于不同类型的key有不同的哈希函数,保证均匀散列,哈希表的效率高一些。
3.2.3 关于getfreepos
当mainpostion被占时,需要在哈希表中找到一个空闲可用的节点,是通过递减lastfree域来实现的。然后lua在设置键位的值为nil时,也不会立即回收节点,而是在lastfree域递减到头时,做一次rehash操作。
//ltable.c
static Node *getfreepos (Table *t) {
if (!isdummy(t)) {
while (t->lastfree > t->node) {
t->lastfree--;
if (ttisnil(gkey(t->lastfree)))
return t->lastfree;
}
}
return NULL; /* could not find a free place */
}
3.2.4 关于rehash
当newkey时发现没有空闲节点可用了,就会触发rehash操作。rehash做的事情主要是统计数组和哈希表中的有效键值对,重新分配数组和哈希表的空间大小。
原则是:数组部分的利用率要超过50%
//ltable.c
/*
** nums[i] = number of keys 'k' where 2^(i - 1) < k <= 2^i
*/
static void rehash (lua_State *L, Table *t, const TValue *ek) {
unsigned int asize; /* optimal size for array part */
unsigned int na; /* number of keys in the array part */
unsigned int nums[MAXABITS + 1];
int i;
int totaluse;
for (i = 0; i <= MAXABITS; i++) nums[i] = 0; /* reset counts */
na = numusearray(t, nums); /* count keys in array part */
totaluse = na; /* all those keys are integer keys */
totaluse += numusehash(t, nums, &na); /* count keys in hash part */
/* count extra key */
na += countint(ek, nums);
totaluse++;
/* compute new size for array part */
asize = computesizes(nums, &na);
/* resize the table to new computed sizes */
luaH_resize(L, t, asize, totaluse - na);
}
nums这个数组用来统计整数键值对。num[i]记录了从2^(i – 1)到2^i的键值对的数量。
统计完数组和哈希表中的整数键值对之后,会调用computesizes函数计算新的数组大小。
最后再调用resize函数重新分配数组和哈希表大小,并把不能存在数组里的键值对放入哈希表中。
3.2.5 关于numusearray,numusehash,countint
//ltable.c
static unsigned int numusearray (const Table *t, unsigned int *nums) {
int lg;
unsigned int ttlg; /* 2^lg */
unsigned int ause = 0; /* summation of 'nums' */
unsigned int i = 1; /* count to traverse all array keys */
/* traverse each slice */
for (lg = 0, ttlg = 1; lg <= MAXABITS; lg++, ttlg *= 2) {
unsigned int lc = 0; /* counter */
unsigned int lim = ttlg;
if (lim > t->sizearray) {
lim = t->sizearray; /* adjust upper limit */
if (i > lim)
break; /* no more elements to count */
}
/* count elements in range (2^(lg - 1), 2^lg] */
for (; i <= lim; i++) {
if (!ttisnil(&t->array[i-1]))
lc++;
}
nums[lg] += lc;
ause += lc;
}
return ause;
}
//在nums数组中统计哈希表中正整数键的键值对
static int numusehash (const Table *t, unsigned int *nums, unsigned int *pna) {
int totaluse = 0; /* total number of elements */
int ause = 0; /* elements added to 'nums' (can go to array part) */
int i = sizenode(t);
while (i--) {
Node *n = &t->node[i];
if (!ttisnil(gval(n))) {
ause += countint(gkey(n), nums);
totaluse++;
}
}
*pna += ause;
return totaluse;
}
//判断键是否是正整数键
static int countint (const TValue *key, unsigned int *nums) {
unsigned int k = arrayindex(key);
if (k != 0) { /* is 'key' an appropriate array index? */
nums[luaO_ceillog2(k)]++; /* count as such */
return 1;
}
else
return 0;
}
3.2.6 关于computesizes
//ltable.c
static unsigned int computesizes (unsigned int nums[], unsigned int *pna) {
int i;
unsigned int twotoi; /* 2^i (candidate for optimal size) */
unsigned int a = 0; /* number of elements smaller than 2^i */
unsigned int na = 0; /* number of elements to go to array part */
unsigned int optimal = 0; /* optimal size for array part */
/* loop while keys can fill more than half of total size */
for (i = 0, twotoi = 1;
twotoi > 0 && *pna > twotoi / 2;
i++, twotoi *= 2) {
if (nums[i] > 0) {
a += nums[i];
if (a > twotoi/2) { /* more than half elements present? */
optimal = twotoi; /* optimal size (till now) */
na = a; /* all elements up to 'optimal' will go to array part */
}
}
}
lua_assert((optimal == 0 || optimal / 2 < na) && na <= optimal);
*pna = na;
return optimal;
}
遍历之前nums统计的所有键值对数量,看看每个nums[i]区间内的使用率是否大于50%,找到这个区间的最大值,对应的目前的键值对数量就是目前lua表数组部分的最优大小。
3.2.7 关于luaH_resize
//ltable.c
void luaH_resize (lua_State *L, Table *t, unsigned int nasize,unsigned int nhsize) {
unsigned int i;
int j;
AuxsetnodeT asn;
unsigned int oldasize = t->sizearray;
int oldhsize = allocsizenode(t);
Node *nold = t->node; /* save old hash ... */
if (nasize > oldasize) /* array part must grow? */
setarrayvector(L, t, nasize);
/* create new hash part with appropriate size */
asn.t = t; asn.nhsize = nhsize;
if (luaD_rawrunprotected(L, auxsetnode, &asn) != LUA_OK) { /* mem. error? */
setarrayvector(L, t, oldasize); /* array back to its original size */
luaD_throw(L, LUA_ERRMEM); /* rethrow memory error */
}
if (nasize < oldasize) { /* array part must shrink? */
t->sizearray = nasize;
/* re-insert elements from vanishing slice */
for (i=nasize; i<oldasize; i++) {
if (!ttisnil(&t->array[i]))
luaH_setint(L, t, i + 1, &t->array[i]);
}
/* shrink array */
luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
}
/* re-insert elements from hash part */
for (j = oldhsize - 1; j >= 0; j--) {
Node *old = nold + j;
if (!ttisnil(gval(old))) {
/* doesn't need barrier/invalidate cache, as entry was
already present in the table */
setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
}
}
if (oldhsize > 0) /* not the dummy node? */
luaM_freearray(L, nold, cast(size_t, oldhsize)); /* free old hash */
}
4. 关于取长度
lua中使用#符号对表进行取长度操作。
//ltable.c
/*
** Try to find a boundary in table 't'. A 'boundary' is an integer index
** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
*/
lua_Unsigned luaH_getn (Table *t) {
unsigned int j = t->sizearray;
if (j > 0 && ttisnil(&t->array[j - 1])) {
/* there is a boundary in the array part: (binary) search for it */
unsigned int i = 0;
while (j - i > 1) {
unsigned int m = (i+j)/2;
if (ttisnil(&t->array[m - 1])) j = m;
else i = m;
}
return i;
}
/* else must find a boundary in hash part */
else if (isdummy(t)) /* hash part is empty? */
return j; /* that is easy... */
else return unbound_search(t, j);
}
先判断数组部分最后一个是否为nil,如果是nil,则使用二分法在数组部分寻找下一个为nil 的节点,确认长度;否则先判断哈希表部分是否为空,若为空则长度为数组部分;否则在哈希表中寻找边界,也是二分法。
//ltable.c
static lua_Unsigned unbound_search (Table *t, lua_Unsigned j) {
lua_Unsigned i = j; /* i is zero or a present index */
j++;
/* find 'i' and 'j' such that i is present and j is not */
while (!ttisnil(luaH_getint(t, j))) {
i = j;
if (j > l_castS2U(LUA_MAXINTEGER) / 2) { /* overflow? */
/* table was built with bad purposes: resort to linear search */
i = 1;
while (!ttisnil(luaH_getint(t, i))) i++;
return i - 1;
}
j *= 2;
}
/* now do a binary search between them */
while (j - i > 1) {
lua_Unsigned m = (i+j)/2;
if (ttisnil(luaH_getint(t, m))) j = m;
else i = m;
}
return i;
}
5. 关于迭代
lua表为了兼容数组和哈希表部分的访问,迭代操作传入的是一个key,返回这个key的下一个数据。对外的API是luaH_next函数。
//ltable.c
int luaH_next (lua_State *L, Table *t, StkId key) {
unsigned int i = findindex(L, t, key); /* find original element */
for (; i < t->sizearray; i++) { /* try first array part */
if (!ttisnil(&t->array[i])) { /* a non-nil value? */
setivalue(key, i + 1);
setobj2s(L, key+1, &t->array[i]);
return 1;
}
}
for (i -= t->sizearray; cast_int(i) < sizenode(t); i++) { /* hash part */
if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */
setobj2s(L, key, gkey(gnode(t, i)));
setobj2s(L, key+1, gval(gnode(t, i)));
return 1;
}
}
return 0; /* no more elements */
}
7. 注意事项
- lua表的rehash操作代价很大,所以要尽量避免这个操作,可以进行预分配来规避问题,提升效率。
local array = {}
array[1] = 1
array[2] = 2
array[3] = 3
//如果需要创建很多小表的时候,上面操作可以换成,减少解释器rehash
local array = {1,2,3}
array[1] = 1
array[2] = 2
array[3] = 3
- 尽量不要混用数组和哈希表部分,一个table只管理一类型的数据。
8. 参考
- 数据结构 哈希表(HashTable)_哈希概述表
- 《Lua设计与实现》
- 《Lua源码欣赏》云风