of {$slidecount} ½ {$title} ATZJG.NET {$author}

首页






散列
  • 基本思想
  • 散列函数
  • 分离链接法
  • 不使用链表的散列表
  • 再散列
  • 标准库中的散列表


Haifeng Xu


(hfxu@yzu.edu.cn)

This slide is based on the book by Mark Allen Weiss,
Data Structures and Algorithm Analysis in C++
张怀勇等译.

目录

散列表(hash table)

散列表(hash table)

散列表的实现通常称为散列(hashing)

散列是一种用于以常数平均时间执行插入、删除和查找的技术.

基本思想

基本思想

理想的散列表数据结构只不过是一个包含一些项的具有固定大小的数组.

表的大小记作 TableSize.

将每个键映射到从 0 到 TableSize-1 这个范围中的某个数, 并且将其放到适当的单元中. 这个映射就称为 散列函数(hash function).

理想情况下, 散列函数应该运算简单并且保证任何两个不同的键映射到不同的单元. 即是单射.

不过这是不可能的, 因为单元的数目是有限的, 而键实际上是用不完的.

因此, 我们寻找一个散列函数, 该函数要在单元之间均匀地分配键. 还要选择一个函数, 决定当两个键散列到同一个值的时候(称为冲突(collision))应该做什么以及如何确定散列表的大小.

散列函数

散列函数

如果键是整数, 则一般合理的方法是 $f(Key)=Key \mod TableSize$.

通常键是字符串

这种情形下, 散列函数需要仔细选择.

一种方法是将字符串中字符的 ASCII 码值加起来.

/**
 * Hash a string by adding up the values of its characters.
 *
 * Every character is read as unsigned char, so bytes above 127 cannot make
 * the sum (and therefore the returned index) negative on platforms where
 * plain char is signed.
 *
 * key:       the string to hash.
 * tableSize: number of cells in the table; must be positive.
 * Returns the character sum reduced modulo tableSize.
 */
int hash( const std::string & key, int tableSize )
{
      int hashVal = 0;

      for( std::string::size_type i = 0; i < key.length( ); i++ )
          hashVal += static_cast<unsigned char>( key[ i ] );

      return hashVal % tableSize;
}

不过, 如果表很大, 则函数就不会很好地分配键. 例如, 设 TableSize = 10007, 并设所有的键至多含 8 个字符.

由于 ASCII 字符的值最多是 127, 因此散列函数只能在 $0\sim 1016$ 之间取值, 其中 $1016=127\times 8$. 这显然不是一种均匀的分配.

线性探测

线性探测

对于每个元素 $x$, 依次进行计算 $h_i(x)=(hash(x)+f(i))\mod TableSize$, 其中 $i$ 从 $0$ 开始, $f(i)=i$. 直到某个 $h_i(x)$ 处是空的, 可以存入元素为止.

平方探测

平方探测

平方探测就是上面线性探测函数中令 $f(i)=i^2$ 的处理方式.

SeparateChaining.h

SeparateChaining.h

#ifndef SEPARATE_CHAINING_H
#define SEPARATE_CHAINING_H

#include <vector>
#include <list>
#include <string>
#include <algorithm>
using namespace std;


int nextPrime( int n );

// SeparateChaining Hash table class
//
// CONSTRUCTION: an initial number of buckets or default of 101
//
// ******************PUBLIC OPERATIONS*********************
// bool insert( x )       --> Insert x; false if x is already present
// bool remove( x )       --> Remove x; false if x is not present
// bool contains( x )     --> Return true if x is present
// void makeEmpty( )      --> Remove all items
// int hash( string str ) --> Global method to hash strings

int hash( const std::string & key );
int hash( int key );

/**
 * Hash table that resolves collisions by separate chaining: each bucket is a
 * list holding every stored item whose hash value maps to that bucket.
 *
 * HashedObj must support operator== (used through std::find) and must have a
 * free function hash( x ) returning int, either one of the global overloads
 * declared above or one reachable by argument-dependent lookup.
 */
template <typename HashedObj>
class HashTable
{
  public:
    /**
     * Build an empty table with `size` buckets. A non-positive size is
     * raised to 1 so that myhash never divides by zero.
     */
    explicit HashTable( int size = 101 )
      : currentSize( 0 )
      { theLists.resize( size > 0 ? size : 1 ); }

    /** Return true if x is stored in the table. */
    bool contains( const HashedObj & x ) const
    {
        const std::list<HashedObj> & bucket = theLists[ myhash( x ) ];
        return std::find( bucket.begin( ), bucket.end( ), x ) != bucket.end( );
    }

    /** Remove every item; the number of buckets is left unchanged. */
    void makeEmpty( )
    {
        for( unsigned int i = 0; i < theLists.size( ); i++ )
            theLists[ i ].clear( );

            // Keep the item count in step with the now-empty buckets;
            // otherwise later inserts would trigger rehash( ) too early.
        currentSize = 0;
    }

    /**
     * Insert x. Returns false (and changes nothing) if x is already present.
     * The table is rehashed once the item count exceeds the bucket count.
     */
    bool insert( const HashedObj & x )
    {
        std::list<HashedObj> & bucket = theLists[ myhash( x ) ];
        if( std::find( bucket.begin( ), bucket.end( ), x ) != bucket.end( ) )
            return false;
        bucket.push_back( x );

            // Rehash; see Section 5.5
        if( ++currentSize > static_cast<int>( theLists.size( ) ) )
            rehash( );

        return true;
    }

    /** Remove x. Returns false if x was not present. */
    bool remove( const HashedObj & x )
    {
        std::list<HashedObj> & bucket = theLists[ myhash( x ) ];
        typename std::list<HashedObj>::iterator itr =
            std::find( bucket.begin( ), bucket.end( ), x );

        if( itr == bucket.end( ) )
            return false;

        bucket.erase( itr );
        --currentSize;
        return true;
    }

  private:
    std::vector<std::list<HashedObj> > theLists;   // The array of Lists
    int  currentSize;                              // Number of stored items

    /**
     * Move every item into a new table whose bucket count is
     * nextPrime( 2 * old bucket count ).
     */
    void rehash( )
    {
            // Take over the old buckets without copying them; theLists is
            // left empty, so the resize below creates only empty lists.
        std::vector<std::list<HashedObj> > oldLists;
        oldLists.swap( theLists );

            // Create new double-sized, empty table
        theLists.resize( nextPrime( 2 * static_cast<int>( oldLists.size( ) ) ) );

            // Copy table over; insert( ) rebuilds currentSize as it goes
        currentSize = 0;
        for( unsigned int i = 0; i < oldLists.size( ); i++ )
        {
            typename std::list<HashedObj>::iterator itr = oldLists[ i ].begin( );
            for( ; itr != oldLists[ i ].end( ); ++itr )
                insert( *itr );
        }
    }

    /** Map x to a bucket index in the range [0, theLists.size( )). */
    int myhash( const HashedObj & x ) const
    {
            // Name the global overloads explicitly: with `using namespace std;`
            // in effect, a plain `hash` also names the std::hash class
            // template and the call is rejected as ambiguous. Argument-
            // dependent lookup still applies to the call below.
        using ::hash;

            // Reduce in signed arithmetic so that a negative hash value
            // really reaches the correction below.
        const int tableSize = static_cast<int>( theLists.size( ) );
        int hashVal = hash( x ) % tableSize;
        if( hashVal < 0 )
            hashVal += tableSize;

        return hashVal;
    }
};
#endif

SeparateChaining.cpp

SeparateChaining.cpp

#include "SeparateChaining.h"
#include <iostream>
using namespace std;


/**
 * Internal method to test whether a number is prime.
 * Uses trial division by odd numbers; not an efficient algorithm.
 * Returns false for every n below 2, including zero and negative values.
 */
bool isPrime( int n )
{
    if( n < 2 )
        return false;

    if( n == 2 || n == 3 )
        return true;

    if( n % 2 == 0 )
        return false;

    // Compare i with n / i instead of i * i with n: the product would
    // overflow int when n is close to the largest int value.
    for( int i = 3; i <= n / i; i += 2 )
        if( n % i == 0 )
            return false;

    return true;
}

/**
 * Internal method that yields a prime no smaller than n.
 * Precondition: n > 0.
 */
int nextPrime( int n )
{
    // Only odd candidates are examined, so an even starting value
    // (2 included) is first stepped up by one.
    int candidate = ( n % 2 == 0 ) ? n + 1 : n;

    while( !isPrime( candidate ) )
        candidate += 2;

    return candidate;
}

/**
 * A hash routine for string objects.
 *
 * Folds the characters together as hashVal = 37 * hashVal + key[ i ].
 * The arithmetic is carried out in unsigned int, where wrap-around is well
 * defined; the same computation in int overflows once the key is more than
 * a few characters long.
 *
 * The returned value may be negative.
 */
int hash( const std::string & key )
{
    unsigned int hashVal = 0;

    for( std::string::size_type i = 0; i < key.length( ); i++ )
        hashVal = 37u * hashVal + static_cast<unsigned int>( key[ i ] );

    return static_cast<int>( hashVal );
}

/**
 * Hash routine for int keys: the key serves as its own hash value.
 */
int hash( int key )
{
    const int hashVal = key;
    return hashVal;
}

TestSeparateChaining

TestSeparateChaining

#include <iostream>
#include "SeparateChaining.h"
using namespace std;

    // Simple main
int main( )
{
    HashTable<int> H;

    const int NUMS = 4000;
    const int GAP  =   37;
    int i;

    cout << "Checking... (no more output means success)" << endl;

    for( i = GAP; i != 0; i = ( i + GAP ) % NUMS )
        H.insert( i );
    for( i = 1; i < NUMS; i += 2 )
        H.remove( i );

    for( i = 2; i < NUMS; i += 2 )
        if( !H.contains( i ) )
            cout << "Contains fails " << i << endl;

    for( i = 1; i < NUMS; i += 2 )
    {
        if( H.contains( i ) )
            cout << "OOPS!!! " <<  i << endl;
    }

    return 0;
}

平方探测

QuadraticProbing.h

#ifndef QUADRATIC_PROBING_H
#define QUADRATIC_PROBING_H

#include <vector>
#include <string>
using namespace std;

int nextPrime( int n );
int hash( const std::string & key );
int hash( int key );

// QuadraticProbing Hash table class
//
// CONSTRUCTION: an approximate initial size or default of 101
//
// ******************PUBLIC OPERATIONS*********************
// bool insert( x )       --> Insert x; false if x is already present
// bool remove( x )       --> Remove x; false if x is not present
// bool contains( x )     --> Return true if x is present
// void makeEmpty( )      --> Remove all items
// int hash( string str ) --> Global method to hash strings

/**
 * Open-addressing hash table that resolves collisions by quadratic probing.
 * Removal is lazy: a removed cell is only marked DELETED.
 *
 * HashedObj must be default-constructible, copyable, support operator!= and
 * have a free function hash( x ) returning int, either one of the global
 * overloads declared above or one reachable by argument-dependent lookup.
 */
template <typename HashedObj>
class HashTable
{
  public:
    /**
     * Build an empty table with nextPrime( size ) cells. A non-positive
     * size is treated as 1 so that no negative value reaches the vector
     * constructor.
     */
    explicit HashTable( int size = 101 ) : array( nextPrime( size > 0 ? size : 1 ) )
      { makeEmpty( ); }

    /** Return true if x is stored and active. */
    bool contains( const HashedObj & x ) const
    {
        return isActive( findPos( x ) );
    }

    /** Mark every cell EMPTY and reset the counter. */
    void makeEmpty( )
    {
        currentSize = 0;
        for( unsigned int i = 0; i < array.size( ); i++ )
            array[ i ].info = EMPTY;
    }

    /**
     * Insert x. Returns false (and changes nothing) if x is already active.
     * The table is rehashed once the counter exceeds half the cell count.
     */
    bool insert( const HashedObj & x )
    {
            // Insert x as active
        const int currentPos = findPos( x );
        if( isActive( currentPos ) )
            return false;

        array[ currentPos ] = HashEntry( x, ACTIVE );

            // Rehash; see Section 5.5
        if( ++currentSize > static_cast<int>( array.size( ) / 2 ) )
            rehash( );

        return true;
    }

    /**
     * Remove x by marking its cell DELETED. Returns false if x is not
     * active. currentSize is not decremented here, so it keeps counting
     * cells that have been filled since the last rehash or makeEmpty.
     */
    bool remove( const HashedObj & x )
    {
        const int currentPos = findPos( x );
        if( !isActive( currentPos ) )
            return false;

        array[ currentPos ].info = DELETED;
        return true;
    }

    enum EntryType { ACTIVE, EMPTY, DELETED };

  private:
    // One table cell: the stored element plus its state.
    struct HashEntry
    {
        HashedObj element;
        EntryType info;

        HashEntry( const HashedObj & e = HashedObj( ), EntryType i = EMPTY )
          : element( e ), info( i ) { }
    };

    std::vector<HashEntry> array;
    int currentSize;   // Incremented by insert, reset by makeEmpty / rehash

    bool isActive( int currentPos ) const
      { return array[ currentPos ].info == ACTIVE; }

    /**
     * Return the index of the cell that holds x (whether ACTIVE or DELETED)
     * or, if there is none on the probe path, of the first EMPTY cell met.
     */
    int findPos( const HashedObj & x ) const
    {
        const int tableSize = static_cast<int>( array.size( ) );
        int offset = 1;
        int currentPos = myhash( x );

        while( array[ currentPos ].info != EMPTY &&
                array[ currentPos ].element != x )
        {
                // Adding 1, 3, 5, ... in turn puts the ith probe at
                // distance i*i from the home cell.
            currentPos += offset;  // Compute ith probe
            offset += 2;
            if( currentPos >= tableSize )
                currentPos -= tableSize;
        }

        return currentPos;
    }

    /**
     * Re-insert every ACTIVE element into a new table with
     * nextPrime( 2 * old cell count ) cells; DELETED cells are dropped.
     */
    void rehash( )
    {
            // Take over the old cells without copying them; array is left
            // empty, so the resize below creates only EMPTY entries.
        std::vector<HashEntry> oldArray;
        oldArray.swap( array );

            // Create new double-sized, empty table
        array.resize( nextPrime( 2 * static_cast<int>( oldArray.size( ) ) ) );

            // Copy table over; insert( ) rebuilds currentSize as it goes
        currentSize = 0;
        for( unsigned int i = 0; i < oldArray.size( ); i++ )
            if( oldArray[ i ].info == ACTIVE )
                insert( oldArray[ i ].element );
    }

    /** Map x to a cell index in the range [0, array.size( )). */
    int myhash( const HashedObj & x ) const
    {
            // Name the global overloads explicitly: with `using namespace std;`
            // in effect, a plain `hash` also names the std::hash class
            // template and the call is rejected as ambiguous. Argument-
            // dependent lookup still applies to the call below.
        using ::hash;

            // Reduce in signed arithmetic so that a negative hash value
            // really reaches the correction below.
        const int tableSize = static_cast<int>( array.size( ) );
        int hashVal = hash( x ) % tableSize;
        if( hashVal < 0 )
            hashVal += tableSize;

        return hashVal;
    }
};

#endif

平方探测

QuadraticProbing.cpp

#include "QuadraticProbing.h"
#include <iostream>
using namespace std;

/**
 * Internal method to test whether a number is prime.
 * Uses trial division by odd numbers; not an efficient algorithm.
 * Returns false for every n below 2, including zero and negative values.
 */
bool isPrime( int n )
{
    if( n < 2 )
        return false;

    if( n == 2 || n == 3 )
        return true;

    if( n % 2 == 0 )
        return false;

    // Compare i with n / i instead of i * i with n: the product would
    // overflow int when n is close to the largest int value.
    for( int i = 3; i <= n / i; i += 2 )
        if( n % i == 0 )
            return false;

    return true;
}

/**
 * Internal method that yields a prime no smaller than n.
 * Precondition: n > 0.
 */
int nextPrime( int n )
{
    // Only odd candidates are examined, so an even starting value
    // (2 included) is first stepped up by one.
    int candidate = ( n % 2 == 0 ) ? n + 1 : n;

    while( !isPrime( candidate ) )
        candidate += 2;

    return candidate;
}

/**
 * A hash routine for string objects.
 *
 * Folds the characters together as hashVal = 37 * hashVal + key[ i ].
 * The arithmetic is carried out in unsigned int, where wrap-around is well
 * defined; the same computation in int overflows once the key is more than
 * a few characters long.
 *
 * The returned value may be negative.
 */
int hash( const std::string & key )
{
    unsigned int hashVal = 0;

    for( std::string::size_type i = 0; i < key.length( ); i++ )
        hashVal = 37u * hashVal + static_cast<unsigned int>( key[ i ] );

    return static_cast<int>( hashVal );
}

/**
 * Hash routine for int keys: the key serves as its own hash value.
 */
int hash( int key )
{
    const int hashVal = key;
    return hashVal;
}

平方探测

TestQuadraticProbing.cpp

#include <iostream>
#include "QuadraticProbing.h"
using namespace std;

    // Simple main
int main( )
{
    HashTable<int> H;

    const int NUMS = 4000;
    const int GAP  =   37;
    int i;

    cout << "Checking... (no more output means success)" << endl;

    int numCount=0;
    int successCount=0;
    for( i = GAP; i != 0; i = ( i + GAP ) % NUMS )
    {
        if(H.insert( i ))
        {
            cout << "insert " << i << endl;
            successCount++;
        }
        numCount++;
    }
    cout << "We have insert " << successCount <<  " elements during " << numCount << " operations." << endl;
    if(H.contains(4000))
    {
        cout << "contains 4000" << endl;
    }
    else 
    {
        cout << "4000 is not in the hash table." << endl;
    }

    //remove the odd numbers.
    for( i = 1; i < NUMS; i += 2 )
    {
        //if(H.remove( i ))cout << "remove " << i << " success" << endl;
        //else cout << i << " is not contained in this hash table." << endl;
    }

    //test if even number is contained in H.
    for( i = 2; i < NUMS; i +=2 )
        if( !H.contains( i ) )
            cout  <<  "Contains fails "  <<  i  <<  endl;

    for( i = 1; i < NUMS; i += 2 )
    {
        //if( H.contains( i ) )
          //  cout  <<  "OOPS!!! "  <<   i  <<  endl;
    }

    return 0;
}

HashEntry

HashEntry

/**
 * One key/value pair stored in a HashMap slot. Immutable after construction.
 */
class HashEntry {

private:
      int key;
      int value;

public:
      HashEntry(int key, int value) : key(key), value(value) {
      }

      int getKey() const {
            return key;
      }

      int getValue() const {
            return value;
      }

};

// Number of slots in every HashMap; the table never grows.
const int TABLE_SIZE = 128;

/**
 * Fixed-capacity map from int keys to int values, implemented with open
 * addressing and linear probing over TABLE_SIZE slots. Entries cannot be
 * removed, so at most TABLE_SIZE distinct keys can ever be stored.
 */
class HashMap {

private:
      HashEntry **table;

      // The map owns its entries through raw pointers, so a member-wise copy
      // would make two maps delete the same entries. Copying is therefore
      // disabled: declared private and intentionally left undefined.
      HashMap(const HashMap &);
      HashMap &operator=(const HashMap &);

      /**
       * Find the slot for key: the slot already holding key or, failing
       * that, the first empty slot on its probe path. Returns -1 when all
       * TABLE_SIZE slots have been probed without finding either.
       */
      int findSlot(int key) const {
            // % may yield a negative remainder for a negative key; shift it
            // back into the valid index range.
            int slot = key % TABLE_SIZE;
            if (slot < 0)
                  slot += TABLE_SIZE;

            // Bounding the number of probes keeps a full table from
            // looping forever.
            for (int probes = 0; probes < TABLE_SIZE; probes++) {
                  if (table[slot] == NULL || table[slot]->getKey() == key)
                        return slot;
                  slot = (slot + 1) % TABLE_SIZE;
            }

            return -1;
      }

public:
      HashMap() {
            table = new HashEntry*[TABLE_SIZE];
            for (int i = 0; i < TABLE_SIZE; i++)
                  table[i] = NULL;
      }

      /**
       * Return the value stored under key, or -1 if key is absent. A stored
       * value of -1 cannot be told apart from "absent".
       */
      int get(int key) const {
            const int slot = findSlot(key);
            if (slot < 0 || table[slot] == NULL)
                  return -1;

            return table[slot]->getValue();
      }

      /**
       * Store value under key, replacing any previous value for that key.
       * Returns true on success and false when the table is full and key is
       * not already present, in which case nothing is stored.
       */
      bool put(int key, int value) {
            const int slot = findSlot(key);
            if (slot < 0)
                  return false;

            // Allocate first so the old entry survives if new throws.
            HashEntry *fresh = new HashEntry(key, value);
            delete table[slot];
            table[slot] = fresh;
            return true;
      }

      ~HashMap() {
            for (int i = 0; i < TABLE_SIZE; i++)
                  delete table[i];

            delete[] table;
      }

};

散列的应用

散列的应用

编译器

编译器使用散列表跟踪源代码中声明的变量, 这种数据结构叫作符号表(symbol table).

图论问题

散列表适用于任何其结点有实名而不是数字名的图论问题.

为游戏编制的程序

当程序搜索游戏的不同的运动路径时, 它通过计算基于位置的散列函数而跟踪一些已知的位置(并把对于该位置的移动存储起来). 如果同样的位置再次出现, 程序通常通过简单的移动变换来避免昂贵的重复计算.

游戏程序的这种一般特点叫作置换表(transposition table).

在线拼写检查程序

如果拼写检查程序的主要功能是检查拼写错误(而非纠正错误), 那么可以预先将整个词典进行散列, 这样就可以在常数时间 $O(1)$ 内检查单词拼写.

End






Thanks very much!

This slide is based on Jeffrey D. Ullman's work, which can be downloaded from his website.