This slide is based on the book of Mark Allen Weiss
张怀勇等译(第三版) 以及冯舜玺 译(第四版).
散列表的实现通常称为散列(hashing).
散列是一种用于以常数平均时间执行插入、删除和查找的技术.
理想的散列表数据结构只不过是一个包含一些项的具有固定大小的数组.
表的大小记作 $TableSize$.
将每个键映射到从 $0$ 到 $TableSize-1$ 这个范围中的某个数, 并将其放到相应的单元中. 这个映射称为散列函数(hash function).
理想情况下, 散列函数应该运算简单并且保证任何两个不同的键映射到不同的单元. 即是单射.
不过这是不可能的, 因为单元的数目是有限的, 而键实际上是用不完的.
因此, 我们寻找一个散列函数, 该函数要在单元之间均匀地分配键. 还要选择一个函数, 决定当两个键散列到同一个值的时候(称为冲突(collision))应该做什么, 以及如何确定散列表的大小.
如果键是整数, 则一般合理的方法是直接返回 $Key \bmod TableSize$, 除非 $Key$ 碰巧具有某些不理想的性质.
通常键是字符串; 这种情形下, 散列函数需要仔细选择.
一种方法是将字符串中字符的 ASCII 码值加起来, 再对 $TableSize$ 取模:
/**
 * Additive string hash: sums the character codes of key and reduces the
 * sum modulo tableSize.
 *
 * Each character is read as unsigned char so that bytes >= 0x80 (for example
 * UTF-8 text) cannot make the sum, and therefore the returned index, negative
 * on platforms where plain char is signed.
 *
 * @param key        string to hash
 * @param tableSize  number of cells in the table; must be positive
 * @return           a value in [0, tableSize)
 */
int hash( const string & key, int tableSize )
{
    int hashVal = 0;

    for( string::size_type i = 0; i < key.length( ); i++ )
        hashVal += static_cast<unsigned char>( key[ i ] );

    return hashVal % tableSize;
}
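不过, 如果表很大, 则函数就不会很好地分配键. 例如, 设 $TableSize = 10007$ ($10007$ 是素数), 并设所有的键至多 $8$ 个字符长.
由于 ASCII 字符的值最多是 $127$, 因此散列函数只能取 $0$ 到 $1016$ (即 $127 \times 8$) 之间的值, 这显然不是一种均匀的分配.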
对于每个元素
平方探测就是上面线性探测函数中令 $f(i)=i^2$.
#ifndef SEPARATE_CHAINING_H
#define SEPARATE_CHAINING_H
#include <vector>
#include <list>
#include <string>
#include <algorithm>
using namespace std;
int nextPrime( int n );
// SeparateChaining Hash table class
//
// CONSTRUCTION: an approximate initial size or default of 101
//
// ******************PUBLIC OPERATIONS*********************
// bool insert( x ) --> Insert x
// bool remove( x ) --> Remove x
// bool contains( x ) --> Return true if x is present
// void makeEmpty( ) --> Remove all items
// int hash( string str ) --> Global method to hash strings
int hash( const string & key );
int hash( int key );
template <typename HashedObj>
class HashTable
{
public:
explicit HashTable( int size = 101 )
: currentSize( 0 )
{ theLists.resize( size ); }
bool contains( const HashedObj & x ) const
{
const list<HashedObj> & whichList = theLists[ myhash( x ) ];
return find( whichList.begin( ), whichList.end( ), x ) != whichList.end( );
}
void makeEmpty( )
{
for( int i = 0; i < theLists.size( ); i++ )
theLists[ i ].clear( );
}
bool insert( const HashedObj & x )
{
list<HashedObj> & whichList = theLists[ myhash( x ) ];
if( find( whichList.begin( ), whichList.end( ), x ) != whichList.end( ) )
return false;
whichList.push_back( x );
// Rehash; see Section 5.5
if( ++currentSize > theLists.size( ) )
rehash( );
return true;
}
bool remove( const HashedObj & x )
{
list<HashedObj> & whichList = theLists[ myhash( x ) ];
typename list<HashedObj>::iterator itr = find( whichList.begin( ), whichList.end( ), x );
if( itr == whichList.end( ) )
return false;
whichList.erase( itr );
--currentSize;
return true;
}
private:
vector<list<HashedObj> > theLists; // The array of Lists
int currentSize;
void rehash( )
{
vector<list<HashedObj> > oldLists = theLists;
// Create new double-sized, empty table
theLists.resize( nextPrime( 2 * theLists.size( ) ) );
for( unsigned int j = 0; j < theLists.size( ); j++ )
theLists[ j ].clear( );
// Copy table over
currentSize = 0;
for(unsigned int i = 0; i < oldLists.size( ); i++ )
{
typename list<HashedObj>::iterator itr = oldLists[ i ].begin( );
while( itr != oldLists[ i ].end( ) )
insert( *itr++ );
}
}
int myhash( const HashedObj & x ) const
{
int hashVal = hash( x );
hashVal %= theLists.size( );
if( hashVal < 0 )
hashVal += theLists.size( );
return hashVal;
}
};
#endif
#include "SeparateChaining.h"
#include <iostream>
using namespace std;
/**
 * Test whether n is prime by trial division with odd divisors.
 * Not an efficient algorithm.
 *
 * Values below 2 (including negatives) are reported as not prime. The loop
 * bound is written as i <= n / i instead of i * i <= n so that it cannot
 * overflow when n is close to INT_MAX.
 *
 * @param n  number to test
 * @return   true if n is prime, false otherwise
 */
bool isPrime( int n )
{
    if( n < 2 )
        return false;

    if( n == 2 || n == 3 )
        return true;

    if( n % 2 == 0 )
        return false;

    for( int i = 3; i <= n / i; i += 2 )
        if( n % i == 0 )
            return false;

    return true;
}
/**
 * Find a prime that is not smaller than n.
 * An even n is first bumped to n + 1, then odd candidates are tried in
 * increasing order until isPrime accepts one.
 * Assumes n > 0.
 */
int nextPrime( int n )
{
    int candidate = ( n % 2 == 0 ) ? n + 1 : n;

    while( !isPrime( candidate ) )
        candidate += 2;

    return candidate;
}
/**
 * A hash routine for string objects.
 *
 * Evaluates the polynomial sum of key[i] * 37^(n-1-i) with Horner's rule.
 * The arithmetic is done in unsigned int, where wrap-around is well defined;
 * the original int arithmetic overflowed (undefined behaviour) for longer
 * strings. The result is converted back to int, so it may be negative.
 *
 * @param key  string to hash
 * @return     hash code of key (not yet reduced to a table index)
 */
int hash( const string & key )
{
    unsigned int hashVal = 0;

    for( string::size_type i = 0; i < key.length( ); i++ )
        hashVal = 37u * hashVal + static_cast<unsigned int>( key[ i ] );

    return static_cast<int>( hashVal );
}
/**
 * A hash routine for ints.
 * The key is used as its own hash code; no mixing is applied.
 */
int hash( int key )
{
    const int code = key;
    return code;
}
hash 函数的其他构造
/**
 * Hash a NUL-terminated C string.
 * For every character c the accumulator becomes
 *     c + (acc << 6) + (acc << 16) - acc
 * which, in wrapping unsigned arithmetic, equals acc * 65599 + c.
 *
 * @param str  NUL-terminated string; must not be null
 * @return     the accumulated hash value
 */
unsigned long hash( const char *str )
{
    unsigned long acc = 0;

    for( const char *p = str; *p != '\0'; ++p )
    {
        const int c = *p;
        acc = acc * 65599UL + c;
    }

    return acc;
}
#include <iostream>
#include "SeparateChaining.h"
using namespace std;
// Simple main: smoke test for HashTable<int>.
// Inserts 1..NUMS-1, removes the odd values, then reports any even value
// that is missing and any odd value that is still present.
int main( )
{
    const int NUMS = 4000;
    const int GAP = 37;
    HashTable<int> H;

    cout << "Checking... (no more output means success)" << endl;

    // GAP and NUMS are coprime, so stepping by GAP modulo NUMS visits every
    // value in 1..NUMS-1 once before returning to 0.
    int value = GAP;
    while( value != 0 )
    {
        H.insert( value );
        value = ( value + GAP ) % NUMS;
    }

    for( int odd = 1; odd < NUMS; odd += 2 )
        H.remove( odd );

    for( int even = 2; even < NUMS; even += 2 )
    {
        if( !H.contains( even ) )
            cout << "Contains fails " << even << endl;
    }

    for( int odd = 1; odd < NUMS; odd += 2 )
    {
        if( H.contains( odd ) )
            cout << "OOPS!!! " << odd << endl;
    }

    return 0;
}
#ifndef QUADRATIC_PROBING_H
#define QUADRATIC_PROBING_H
#include <vector>
#include <string>
using namespace std;
int nextPrime( int n );
int hash( const string & key );
int hash( int key );
// QuadraticProbing Hash table class
//
// CONSTRUCTION: an approximate initial size or default of 101
//
// ******************PUBLIC OPERATIONS*********************
// bool insert( x ) --> Insert x
// bool remove( x ) --> Remove x
// bool contains( x ) --> Return true if x is present
// void makeEmpty( ) --> Remove all items
// int hash( string str ) --> Global method to hash strings
/**
 * Open-addressing hash table that resolves collisions by quadratic probing.
 * Removal is lazy: a removed cell is marked DELETED and keeps its element.
 *
 * HashedObj needs a default constructor, operator!= and a visible global
 * hash( HashedObj ) overload returning int.
 */
template <typename HashedObj>
class HashTable
{
  public:
    /** Create a table whose cell count is nextPrime( size ), all cells EMPTY. */
    explicit HashTable( int size = 101 ) : array( nextPrime( size ) )
    {
        makeEmpty( );
    }

    /** Return true if x is stored in an ACTIVE cell. */
    bool contains( const HashedObj & x ) const
    {
        const int slot = findPos( x );
        return isActive( slot );
    }

    /** Mark every cell EMPTY and reset the item counter. */
    void makeEmpty( )
    {
        currentSize = 0;
        for( int cell = 0; cell < array.size( ); ++cell )
            array[ cell ].info = EMPTY;
    }

    /**
     * Insert x as an ACTIVE item.
     * @return false if x is already active, true after storing it.
     */
    bool insert( const HashedObj & x )
    {
        const int slot = findPos( x );
        if( isActive( slot ) )
            return false;

        array[ slot ] = HashEntry( x, ACTIVE );
        ++currentSize;

        // Rehash; see Section 5.5
        // Grow as soon as the counter exceeds half of the cell count.
        if( currentSize > array.size( ) / 2 )
            rehash( );

        return true;
    }

    /**
     * Lazily remove x: its cell is flagged DELETED, currentSize is untouched.
     * @return true if x was active, false otherwise.
     */
    bool remove( const HashedObj & x )
    {
        const int slot = findPos( x );
        if( isActive( slot ) )
        {
            array[ slot ].info = DELETED;
            return true;
        }
        return false;
    }

    enum EntryType { ACTIVE, EMPTY, DELETED };

  private:
    /** One table cell: the stored element and the state of the cell. */
    struct HashEntry
    {
        HashedObj element;
        EntryType info;

        HashEntry( const HashedObj & e = HashedObj( ), EntryType i = EMPTY )
          : element( e ), info( i ) { }
    };

    vector<HashEntry> array;   // the cells
    int currentSize;           // incremented by insert, reset by makeEmpty/rehash

    bool isActive( int currentPos ) const
    {
        return array[ currentPos ].info == ACTIVE;
    }

    /**
     * Probe for x starting at myhash( x ).
     * @return the index of the first cell that is EMPTY or whose element
     *         does not compare != to x (the cell may be ACTIVE or DELETED).
     */
    int findPos( const HashedObj & x ) const
    {
        int step = 1;
        int pos = myhash( x );

        for( ; ; )
        {
            if( array[ pos ].info == EMPTY || !( array[ pos ].element != x ) )
                break;

            pos += step;   // Compute ith probe
            // step takes the values 1, 3, 5, 7, ... which yields quadratic
            // probing because n^2 - (n-1)^2 = 2n - 1.
            step += 2;
            if( pos >= array.size( ) )
                pos -= array.size( );
            // To be extra safe this could be pos = pos % array.size( ), but
            // step only grows linearly, so pos + step generally stays below
            // twice array.size( ).
        }

        return pos;
    }

    /**
     * Grow to nextPrime( 2 * old cell count ), clear all cells and re-insert
     * the elements that were ACTIVE.
     */
    void rehash( )
    {
        vector<HashEntry> previous = array;

        // Create new double-sized, empty table
        array.resize( nextPrime( 2 * previous.size( ) ) );
        makeEmpty( );   // marks every cell EMPTY and zeroes currentSize

        // Copy table over
        for( int k = 0; k < previous.size( ); ++k )
        {
            if( previous[ k ].info == ACTIVE )
                insert( previous[ k ].element );
        }
    }

    /**
     * Reduce hash( x ) to a cell index.
     * NOTE(review): the modulo is evaluated in the unsigned type of
     * array.size( ), so the negative adjustment below looks unreachable --
     * confirm before relying on it.
     */
    int myhash( const HashedObj & x ) const
    {
        int hashVal = hash( x );

        hashVal = hashVal % array.size( );
        if( hashVal < 0 )
            hashVal += array.size( );

        return hashVal;
    }
};
#endif
#include "QuadraticProbing.h"
#include <iostream>
using namespace std;
/**
 * Test whether n is prime by trial division with odd divisors.
 * Not an efficient algorithm.
 *
 * Values below 2 (including negatives) are reported as not prime. The loop
 * bound is written as i <= n / i instead of i * i <= n so that it cannot
 * overflow when n is close to INT_MAX.
 *
 * @param n  number to test
 * @return   true if n is prime, false otherwise
 */
bool isPrime( int n )
{
    if( n < 2 )
        return false;

    if( n == 2 || n == 3 )
        return true;

    if( n % 2 == 0 )
        return false;

    for( int i = 3; i <= n / i; i += 2 )
        if( n % i == 0 )
            return false;

    return true;
}
/**
 * Find a prime that is not smaller than n.
 * An even n is first bumped to n + 1, then odd candidates are tried in
 * increasing order until isPrime accepts one.
 * Assumes n > 0.
 */
int nextPrime( int n )
{
    int candidate = ( n % 2 == 0 ) ? n + 1 : n;

    while( !isPrime( candidate ) )
        candidate += 2;

    return candidate;
}
/**
 * A hash routine for string objects.
 *
 * Evaluates the polynomial sum of key[i] * 37^(n-1-i) with Horner's rule.
 * The arithmetic is done in unsigned int, where wrap-around is well defined;
 * the original int arithmetic overflowed (undefined behaviour) for longer
 * strings. The result is converted back to int, so it may be negative.
 *
 * @param key  string to hash
 * @return     hash code of key (not yet reduced to a table index)
 */
int hash( const string & key )
{
    unsigned int hashVal = 0;

    for( string::size_type i = 0; i < key.length( ); i++ )
        hashVal = 37u * hashVal + static_cast<unsigned int>( key[ i ] );

    return static_cast<int>( hashVal );
}
/**
 * A hash routine for ints.
 * The key is used as its own hash code; no mixing is applied.
 */
int hash( int key )
{
    const int code = key;
    return code;
}
#include <iostream>
#include "QuadraticProbing.h"
using namespace std;
// Simple main: exercises HashTable<int> and logs every successful insert.
int main( )
{
    const int NUMS = 4000;
    const int GAP = 37;
    HashTable<int> H;

    cout << "Checking... (no more output means success)" << endl;

    // GAP and NUMS are coprime, so stepping by GAP modulo NUMS visits every
    // value in 1..NUMS-1 once before returning to 0.
    int attempts = 0;
    int inserted = 0;
    int value = GAP;
    while( value != 0 )
    {
        const bool added = H.insert( value );
        if( added )
        {
            cout << "insert " << value << endl;
            ++inserted;
        }
        ++attempts;
        value = ( value + GAP ) % NUMS;
    }
    cout << "We have insert " << inserted << " elements during " << attempts << " operations." << endl;

    // 4000 itself is never produced by the loop above.
    if( !H.contains( 4000 ) )
    {
        cout << "4000 is not in the hash table." << endl;
    }
    else
    {
        cout << "contains 4000" << endl;
    }

    // Removal of the odd numbers is currently disabled:
    //   for( i = 1; i < NUMS; i += 2 )
    //   {
    //       if(H.remove( i ))cout << "remove " << i << " success" << endl;
    //       else cout << i << " is not contained in this hash table." << endl;
    //   }

    // Every even number must still be contained in H.
    for( int even = 2; even < NUMS; even += 2 )
    {
        if( !H.contains( even ) )
            cout << "Contains fails " << even << endl;
    }

    // The check that odd numbers are gone is disabled as well:
    //   for( i = 1; i < NUMS; i += 2 )
    //   {
    //       if( H.contains( i ) )
    //           cout << "OOPS!!! " << i << endl;
    //   }

    return 0;
}
// Immutable (key, value) pair stored in one cell of the hash map.
class HashEntry {
public:
    // Store both fields at construction; there are no setters.
    HashEntry(int key, int value) : key(key), value(value) {}

    // Key this entry was created with.
    int getKey() { return key; }

    // Value this entry was created with.
    int getValue() { return value; }

private:
    int key;
    int value;
};
const int TABLE_SIZE = 127; // 127 is prime

/**
 * Fixed-capacity int -> int map using open addressing with linear probing.
 * The map owns TABLE_SIZE heap-allocated HashEntry slots and never grows.
 */
class HashMap {
private:
    HashEntry **table; // array of owning pointers; nullptr marks a free cell

    // Home cell of key. A negative key would give a negative remainder and
    // therefore an out-of-bounds index, so it is folded into [0, TABLE_SIZE).
    // Another hash function can be substituted here.
    static int homeSlot(int key) {
        int slot = key % TABLE_SIZE;
        if (slot < 0)
            slot += TABLE_SIZE;
        return slot;
    }

    // Linear probing from the home cell. Returns the index of the cell that
    // holds key, or of the first free cell reached, or -1 when all
    // TABLE_SIZE cells were visited without finding either (table full).
    int probe(int key) const {
        int slot = homeSlot(key);
        for (int tries = 0; tries < TABLE_SIZE; ++tries) {
            if (table[slot] == nullptr || table[slot]->getKey() == key)
                return slot;
            slot = (slot + 1) % TABLE_SIZE;
        }
        return -1;
    }

public:
    HashMap() {
        table = new HashEntry*[TABLE_SIZE];
        for (int i = 0; i < TABLE_SIZE; i++)
            table[i] = nullptr;
    }

    // The map owns raw pointers; a memberwise copy would delete every entry
    // twice, so copying is disabled.
    HashMap(const HashMap &) = delete;
    HashMap &operator=(const HashMap &) = delete;

    // Look up key. Returns the stored value, or -1 if key is absent (so a
    // stored value of -1 cannot be told apart from "not found").
    int get(int key) {
        const int slot = probe(key);
        if (slot < 0 || table[slot] == nullptr)
            return -1;
        return table[slot]->getValue();
    }

    // Insert key or overwrite its value. If the table is full and key is not
    // present there is no cell to use; the call then leaves the map
    // unchanged instead of probing forever.
    void put(int key, int value) {
        const int slot = probe(key);
        if (slot < 0)
            return;
        delete table[slot]; // deleting nullptr is a no-op
        table[slot] = new HashEntry(key, value);
    }

    ~HashMap() {
        for (int i = 0; i < TABLE_SIZE; i++)
            delete table[i];
        delete[] table;
    }
};
编译器使用散列表跟踪源代码中声明的变量, 这种数据结构叫作符号表(symbol table).
散列表适用于任何其结点有实名而不是数字名的图论问题.
当程序搜索游戏的不同的运动路径时, 它通过计算基于位置的散列函数而跟踪一些已知的位置(并把对于该位置的移动存储起来). 如果同样的位置再次出现, 程序通常通过简单的移动变换来避免昂贵的重复计算.
游戏程序的这种一般特点叫作置换表(transposition table).
如果拼写检查程序的主要功能是检查拼写错误(而非纠正错误), 那么可以预先将整个词典进行散列, 这样就可以在常数时间内检查一个单词是否拼写正确.
假设有两个散列表, 表1 和表2, 对应两个散列函数 $h_1$, $h_2$. 六个元素 $A,B,C,D,E,F$ 在这两个散列函数下的值分别是 $(0,2)$, $(0,0)$, $(1,4)$, $(1,0)$, $(3,2)$, $(3,4)$. 这代表这些元素在两个表中初始可以放置的位置.
| $x$ | $h_1$ | $h_2$ |
|---|---|---|
| A | 0 | 2 |
| B | 0 | 0 |
| C | 1 | 4 |
| D | 1 | 0 |
| E | 3 | 2 |
| F | 3 | 4 |
| 表 1 | |
|---|---|
| 0 | |
| 1 | |
| 2 | |
| 3 | |
| 4 | |
| 表 2 | |
|---|---|
| 0 | |
| 1 | |
| 2 | |
| 3 | |
| 4 | |
插入 $x$, 首先确认它不在表中. 然后使用第一个散列函数, 如果第一个表的位置是空的, 则该项即可置入.
由于 $h_1(A)=0$, 且表1中此位置为空, 故先插入 $A$.
| 表 1 | |
|---|---|
| 0 | A |
| 1 | |
| 2 | |
| 3 | |
| 4 | |
| 表 2 | |
|---|---|
| 0 | |
| 1 | |
| 2 | |
| 3 | |
| 4 | |
由于 $(h_1(B),h_2(B))=(0,0)$, 而表1的位置$0$已被 $A$ 占用. 此时存在两种选择: 第一种是把 $B$ 直接放入表2的位置 $h_2(B)=0$; 第二种是把 $A$ 从表1中踢出, 将 $B$ 放入表1的位置$0$, 再把 $A$ 移到表2的位置 $h_2(A)$.
我们采用第二种做法. 注意 $h_2(A)=2$.
| 表 1 | |
|---|---|
| 0 | B |
| 1 | |
| 2 | |
| 3 | |
| 4 | |
| 表 2 | |
|---|---|
| 0 | |
| 1 | |
| 2 | A |
| 3 | |
| 4 | |
注意 $h_1(C)=1$, 而表1的位置$1$空着, 就直接置入.
| 表 1 | |
|---|---|
| 0 | B |
| 1 | C |
| 2 | |
| 3 | |
| 4 | |
| 表 2 | |
|---|---|
| 0 | |
| 1 | |
| 2 | A |
| 3 | |
| 4 | |
注意 $h_1(D)=1$, 现在表1的位置$1$被 $C$ 占据, 因此先将 $C$ 提出表1, 查询 $h_2(C)=4$. 而表2中位置$4$尚空着, 故将 $C$ 放置在表2位置$4$处.
| 表 1 | |
|---|---|
| 0 | B |
| 1 | D |
| 2 | |
| 3 | |
| 4 | |
| 表 2 | |
|---|---|
| 0 | |
| 1 | |
| 2 | A |
| 3 | |
| 4 | C |
注意 $h_1(E)=3$, 而表1的位置$3$空着, 就直接置入.
| 表 1 | |
|---|---|
| 0 | B |
| 1 | D |
| 2 | |
| 3 | E |
| 4 | |
| 表 2 | |
|---|---|
| 0 | |
| 1 | |
| 2 | A |
| 3 | |
| 4 | C |
注意 $h_1(F)=3$, 而表1的位置$3$被$E$所占, 若将$E$踢出表1至表$2$, 但 $h_2(E)=2$, 表2的位置$2$现在有元素 $A$.
既然是杜鹃散列, 插入元素的宗旨是若位置被占, 则先踢出所占元素. 因此这里的策略是不停地踢出元素到另一个表中, 直到有空位置为止.
下面列出最终的放置结果.
| 表 1 | |
|---|---|
| 0 | A |
| 1 | D |
| 2 | |
| 3 | F |
| 4 | |
| 表 2 | |
|---|---|
| 0 | B |
| 1 | |
| 2 | E |
| 3 | |
| 4 | C |
这里 $G$ 在两个散列函数下的像是 $(1,2)$. 这两个位置均有元素. 如果尝试踢换, 则被踢出的元素会依次踢出其他元素, 最终又把 $G$ 踢出, 形成循环, 插入过程无法终止. 此时需要选取新的散列函数进行再散列.
#pragma once
#include "StringHashFamily.h"
#include<random>
/**
 * Test whether size is prime by trial division with odd divisors.
 *
 * The original loop started at 3 without handling small or even inputs, so
 * it returned true for 0, 1, 4 and every even number below 9. Those cases are
 * now rejected explicitly. The loop bound is i <= size / i so that it cannot
 * overflow for values close to INT_MAX.
 *
 * Declared inline because this definition lives in a header.
 *
 * @param size  number to test
 * @return      true if size is prime, false otherwise
 */
inline bool isPrime(int size)
{
    if (size < 2)
        return false;
    if (size == 2)
        return true;
    if (size % 2 == 0)
        return false;
    for (int i = 3; i <= size / i; i += 2)
    {
        if (size % i == 0)
            return false;
    }
    return true;
}
/**
 * Return a prime that is not smaller than size.
 * Anything up to 2 yields 2; otherwise an even size is bumped to the next
 * odd number and odd candidates are tried until isPrime accepts one.
 *
 * Declared inline because this definition lives in a header: without it,
 * including the header from two translation units defines the function twice.
 *
 * @param size  lower bound for the result
 * @return      a prime >= size
 */
inline int nextPrime(int size)
{
    if (size <= 2)
        return 2;
    if (size % 2 == 0)
        size++;
    while (!isPrime(size))
    {
        size += 2;
    }
    return size;
}
/*
* 为杜鹃散列生成泛型HashFamily接口
*
template<typename AnyType>
class CuckooHashFamily
{
public:
size_t hash(const AnyType& x, int which) const;
int getNumberOfFunctions();
void generateNewFunctions();
};
*/
/*
 * Class interface for cuckoo hashing; any number of hash functions is
 * allowed. HashFamily must provide
 *     size_t hash(const AnyType& x, int which) const;
 *     int    getNumberOfFunctions();
 *     void   generateNewFunctions();
 * and AnyType must be default constructible and comparable with operator==.
 */
template <typename AnyType, typename HashFamily>
class CuckooHashTable
{
public:
    /** Create an empty table with nextPrime(size) cells. */
    explicit CuckooHashTable(int size = 101)
        : array(nextPrime(size))
    {
        numHashFunctions = hashFunctions.getNumberOfFunctions();
        rehashes = 0;
        makeEmpty();
    }

    /** Mark every cell inactive and reset the item count. */
    void makeEmpty()
    {
        currentSize = 0;
        for (auto& entry : array)
            entry.isActive = false;
    }

    /** Return true if x is stored in the table. */
    bool contains(const AnyType& x) const
    {
        return findPos(x) != -1;
    }

    /**
     * Remove x if present.
     * @return true if an item was removed, false if x was not found.
     */
    bool remove(const AnyType& x)
    {
        int currentPos = findPos(x);
        if (!isActive(currentPos))
            return false;
        array[currentPos].isActive = false;
        --currentSize;
        return true;
    }

    /**
     * Insert a copy of x unless it is already present.
     * The table is expanded first when the load factor has reached MAX_LOAD.
     * @return false for a duplicate, true once x has been stored.
     */
    bool insert(const AnyType& x)
    {
        if (contains(x))
            return false;
        if (currentSize >= array.size() * MAX_LOAD)
            expand();
        return insertHelper1(x);
    }

    /** Same as insert(const AnyType&), but moves x into the table. */
    bool insert(AnyType&& x)
    {
        if (contains(x))
            return false;
        if (currentSize >= array.size() * MAX_LOAD)
            expand();
        return insertHelper1(std::move(x));
    }

    /** Number of items currently stored. */
    int size() const
    {
        return currentSize;
    }

    /** Number of cells in the table. */
    int capacity() const
    {
        return array.size();
    }

private:
    /** One table cell: the element and whether the cell is in use. */
    struct HashEntry
    {
        AnyType element;
        bool isActive;
        HashEntry(const AnyType& e = AnyType(), bool a = false)
            :element(e), isActive(a) {}
        HashEntry(AnyType && e, bool a=false)
            :element{ std::move(e) }, isActive{ a }{}
    };

    /**
     * Copying overload: makes one copy of xx and hands it to the rvalue
     * overload so that the eviction logic exists only once.
     */
    bool insertHelper1(const AnyType& xx)
    {
        AnyType copy = xx;
        return insertHelper1(std::move(copy));
    }

    /**
     * Insertion routine for cuckoo hashing.
     * The item to evict is chosen at random, avoiding (for a few attempts)
     * the position that was evicted in the previous round.
     * After COUNT_LIMIT evictions the table picks new hash functions
     * (rehash); after more than ALLOWED_REHASHES rehashes it expands instead.
     * Only ever returns true; it loops until the item has been placed.
     */
    bool insertHelper1(AnyType&& xx)
    {
        const int COUNT_LIMIT = 100;
        AnyType x = std::move(xx);   // take ownership instead of copying
        while (true)
        {
            int lastPos = -1;
            int pos;
            for (int count = 0; count < COUNT_LIMIT; ++count)
            {
                // Try every hash function; use the first free cell.
                for (int i = 0; i < numHashFunctions; ++i)
                {
                    pos = myhash(x, i);
                    if (!isActive(pos))
                    {
                        array[pos] = HashEntry{ std::move(x), true };
                        ++currentSize;
                        return true;
                    }
                }
                // No free cell: evict a random occupant.
                // assumes r.nextInt(n) yields an index in [0, n) -- TODO confirm
                int i = 0;
                do {
                    pos = myhash(x, r.nextInt(numHashFunctions));
                } while (pos == lastPos && i++ < 5);
                lastPos = pos;
                // x now holds the evicted element and is placed next round.
                std::swap(x, array[pos].element);
            }
            if (++rehashes > ALLOWED_REHASHES)
            {
                expand();      // make the table larger
                rehashes = 0;  // reset the rehash counter
            }
            else
                rehash();      // same table size, all-new hash functions
        }
    }

    /** True if currentPos is a valid index (not -1) of a cell in use. */
    bool isActive(int currentPos) const
    {
        return currentPos!=-1 && array[currentPos].isActive;
    }

    /**
     * Compute the hash code of x with hash function number `which` and
     * scale it to a legal array index.
     */
    size_t myhash(const AnyType& x, int which) const
    {
        return hashFunctions.hash(x, which) % array.size();
    }

    /**
     * Consult every hash function.
     * @return the index of the cell containing x, or -1 if x is not found.
     */
    int findPos(const AnyType& x) const
    {
        for (int i = 0; i < numHashFunctions; ++i)
        {
            int pos = myhash(x, i);
            if (isActive(pos) && array[pos].element == x)
                return pos;
        }
        return -1;
    }

    /**
     * Create a larger array but keep the same hash functions.
     * The requested size is the current size divided by MAX_LOAD.
     */
    void expand()
    {
        rehash(static_cast<int>(array.size() / MAX_LOAD));
    }

    /**
     * Keep the array size, generate new hash functions and re-insert all
     * items with them.
     */
    void rehash()
    {
        hashFunctions.generateNewFunctions();
        rehash(array.size());
    }

    /** Rebuild the table with nextPrime(newSize) cells and re-insert all items. */
    void rehash(int newSize)
    {
        // Take over the old cells instead of deep-copying them.
        std::vector<HashEntry> oldArray;
        oldArray.swap(array);

        // array is empty after the swap, so resize fills it with
        // default-constructed (inactive) cells.
        array.resize(nextPrime(newSize));

        // Move the whole table over.
        currentSize = 0;
        for (auto& entry : oldArray)
            if (entry.isActive)
                insert(std::move(entry.element));
    }

    static constexpr double MAX_LOAD = 0.40;   // maximum load factor
    static const int ALLOWED_REHASHES = 5;     // rehashes allowed before expanding

    std::vector<HashEntry> array;
    int currentSize;
    int numHashFunctions;
    int rehashes;
    UniformRandom r;
    HashFamily hashFunctions;
};
#pragma once
#include<vector>
#include<string>
#include "UniformRandom.h"
using std::string;
using std::vector;
// Family of `count` polynomial string hash functions, one per multiplier.
template<int count>
class StringHashFamily
{
public:
    // Allocate `count` multipliers and draw their initial values.
    StringHashFamily() : MULTIPLIERS(count)
    {
        generateNewFunctions();
    }

    // Number of hash functions in the family.
    int getNumberOfFunctions() const
    {
        return count;
    }

    // Replace every multiplier with a fresh value from r.nextInt().
    void generateNewFunctions()
    {
        for (size_t k = 0; k < MULTIPLIERS.size(); ++k)
            MULTIPLIERS[k] = r.nextInt();
    }

    // Hash x with function number `which`: Horner evaluation of the string's
    // characters using MULTIPLIERS[which] as the base.
    size_t hash(const string& x, int which) const
    {
        const int multiplier = MULTIPLIERS[which];
        size_t hashVal = 0;
        for (string::const_iterator it = x.begin(); it != x.end(); ++it)
            hashVal = multiplier * hashVal + *it;
        return hashVal;
    }

private:
    vector<int> MULTIPLIERS; // bases used by hash(): the string is read as a base-n number
    UniformRandom r;
};
#pragma once
#include<time.h>
#include<random>
#include<vector>
using std::vector;
/**
 * Small wrapper around a standard random engine.
 *
 * The engine is a member that is seeded once in the constructor. The previous
 * version built a new engine seeded with time(NULL) inside every call, so all
 * calls made within the same second returned the same value.
 */
class UniformRandom
{
public:
    UniformRandom() : engine(std::random_device{}())
    {
    }

    /** Return a pseudorandom int in [1, 100]. */
    int nextInt()
    {
        std::uniform_int_distribution<int> dist(1, 100);
        return dist(engine);
    }

    /**
     * Return a pseudorandom int in [0, n).
     * The cuckoo hash table uses the result to index n hash functions, so the
     * value must be a valid index; the previous range [n+1, n+100] was always
     * out of bounds for that use. For n <= 1 the only possible answer, 0, is
     * returned.
     */
    int nextInt(int n)
    {
        if (n <= 1)
            return 0;
        std::uniform_int_distribution<int> dist(0, n - 1);
        return dist(engine);
    }

    /** Return a pseudorandom double in [s, s + 1). */
    double nextDouble(double s)
    {
        std::uniform_real_distribution<double> dist(s + 0.0, s + 1.0);
        return dist(engine);
    }

private:
    std::default_random_engine engine;   // seeded once, advanced by every call
};
#include <iostream>
#include <string>
#include "CuckooHashTable.h"
using std::string;
int main()
{
CuckooHashTable<string, StringHashFamily<10>> H;
H.insert("hello");
H.insert("world");
//写一个测试函数.
return 0;
}