The concept of hashing is fundamental to programming, and involves the use of mathematical functions to place incoming data into a table in a speedy, organized fashion. One of the highest priorities in hashing is to organize the table so that performance stays as close to O(1) as possible.

In general, hashing refers to the use of a hash function that takes a key as input. The key is a representative piece of the data, and the hash function maps it to its home bucket or bin, the location in the table where the data is to be stored. In other words:

h(k) -> {0, 1, 2, ..., m-2, m-1}

where k is our key, and 0..m-1 are our buckets/bins.
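
As a minimal illustration (hash_bucket is my own example, not the scheme used by the implementation below), the simplest such function for non-negative integer keys just takes the key modulo the number of buckets:

// minimal sketch: map a non-negative integer key k into one of m buckets
int hash_bucket (int k, int m)
{
    return k % m; // result lies in the range 0 .. m-1 when k >= 0 and m > 0
}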

Some of the ideals of hashing and hash functions are:

A low collision rate is the first ideal, but there is no set method or practice that will systematically decrease the collision rate. In fact, accepting more collisions is sometimes the better trade-off for overall running time. Unfortunately, what counts as a "desirable" collision rate is ambiguous, and must be considered on a case-by-case basis.

For uniform key distribution, the condition is that P_i, the probability that h(k) = i, equals 1/m for every bucket i, where m is the number of buckets (a quick empirical check of this is sketched just after these ideals).

Minimal key computation complexity is rather simple to understand: we don't want to be taking mods of our data multiplied by irrational constants and other such nonsense. The hashing algorithm should be cheap to compute and at least moderately intuitive.
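
The uniformity ideal can be checked empirically: hash a large batch of keys, count how many land in each bucket, and see whether the counts cluster around n/m. Here is a rough sketch of such a check (it is not part of the original code; the multiplier 101 simply mirrors the one used in the sample class later on):

#include <cstdlib>
#include <iostream>

int main ()
{
    const int m = 64;     // number of buckets
    const int n = 100000; // number of keys to hash

    int count [m] = { 0 };

    for (int i = 0; i < n; i++)
    {
        unsigned int k = (unsigned int) rand ();
        count [(k * 101u) % m] ++; // same multiplicative style as the sample class
    }

    // a roughly uniform hash puts about n/m keys in every bucket;
    // a skewed one shows up as a handful of very full buckets
    for (int i = 0; i < m; i++)
        std::cout << i << ": " << count [i] << std::endl;

    return 0;
}

If the printed counts all hover near n/m (about 1562 here), the uniformity condition above is being met for that source of keys.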

I present here a special implementation of a hash table algorithm. I have found this one to be particularly good since it limits the worst case search behaviour and allows the hash table to fill to nearly 100% with no performance penalty.

It employs a "random walk" algorithm when inserting data that shuffles the entries in the hash table around so that there is only a small number of probes required to find any given entry.

It also has the ability to resize itself when required.

Entries that cannot fit into the hash table because of too many collisions are stored in a linear array. When the linear array gets too full and search performance begins to drop off, the hash table is made larger to accommodate more data.

This code has been compiled and tested with MS VC++ 6.0 and DJGPP 3.04.

You can download the code from http://www.cjkware.com/wamckee/hashing.zip



#include <cstdlib> // for NULL and rand()

// assumes class T has the following member functions defined:
//
//    T (T & item); // copy constructor
//    bool operator == (T & rhs); // test for equality (exact match)
//    int first_hash_key (); // must return a value between 0 and INT_MAX
//    int next_hash_key (); // must return a value between 0 and INT_MAX
//

template <class T>
class HashTable
{
private :

    // number of probes
    enum { N = 6 };
        // actual value may vary ... the smaller the value,
        // the faster the average search time
        // if the value is too small, insertion time
        // goes way up and the maximum fill % goes way down
        // (N must be >= 2 or the algorithm never converges)

    enum { NUMBER_OF_TRIES = 100 };
        // the bigger the number the higher the maximum % fill but
        // slower for worst case insertion time
        // (actual number of times may vary depending on what you
        // want the worst case insertion time to be)
        // (the bigger N is, the smaller the number of times you need
        // to try, and vice versa)
        // (could get bigger as hash_table_size increases)

    T * * hash_table;
    int hash_table_size;
    T * * overflow;
    int overflow_size;

public :

    HashTable (int starting_size)
    {
        if (starting_size < 100) // sanity check
            starting_size = 100;

        hash_table = NULL;
        hash_table_size = 0;
        overflow = NULL;
        overflow_size = 0;

        resize (starting_size);
    }

    ~HashTable ()
    {
        int i;

        for (i = 0; i < hash_table_size; i++)
            if (hash_table [i] != NULL)
                delete hash_table [i];
        for (i = 0; i < overflow_size; i++)
            delete overflow [i];

        delete [] hash_table;
        delete [] overflow;
    }

    T * find (T & item); // O(1) average search time

    T * insert (T & item); // O(1) average insert time

    void resize (int new_hash_table_size); // O(n) resize time (rarely called)

private :

    void test_for_resize ()
    {
        if (overflow_size >= 10) // some suitable value to limit
                    // the worst case linear search time
            resize (hash_table_size * 2); // doubling keeps the number of resizes logarithmic
    }

public :

    // helper member functions for computing statistics

    // 100.0 * count () / size () = % full of the hash table
    // over () = number of entries in linear search

    int count ()
    {
        int cnt = 0;
        for (int i = 0; i < hash_table_size; i++)
            if (hash_table [i] != NULL)
                cnt ++;
        return cnt;
    }
    int size () { return hash_table_size; }
    int over () { return overflow_size; }
};

template <class T>
T * HashTable<T>::find (T & item)
{
    int i;

    int probe = item.first_hash_key () % hash_table_size;

    // if you haven't found it after N probes, you're never going
    // to find it in the hash table

    for (i = 0; i < N; i++)
    {
        if (hash_table [probe] != NULL &&
         * hash_table [probe] == item)
            return hash_table [probe];

        probe = item.next_hash_key () % hash_table_size;
    }

    // check the overflow array for the item (linear search)

    for (i = 0; i < overflow_size; i++)
    {
        if (* overflow [i] == item)
            return overflow [i];
    }

    return NULL;
}

// returns a pointer to a copy of item after insertion

template <class T>
T * HashTable<T>::insert (T & item)
{
    // search for an existing entry, if found, don't insert anything

    T * first;

    if ((first = find (item)) != NULL)
        return first;

    // make a new entry ready for insertion into the hash table

    // note: fishy allocation ... lots of little items may cause memory
    // fragmentation -- favours sparse hash tables
    // if you're planning to fill the hash table, copy the item into
    // an array of T instead of allocating them one at a time

    first = new T (item);

    T * next = first;

    // try inserting NUMBER_OF_TRIES times

    for (int limit = 0; limit < NUMBER_OF_TRIES; limit++)
    {
        int probe [N];
        int i;

        // calculate the hash probes (entry locations) for
        // the new item
    
        probe [0] = next -> first_hash_key () % hash_table_size;
        for (i = 1; i < N; i++)
            probe [i] = next -> next_hash_key () % hash_table_size;

        // search for an empty entry in the hash table

        for (i = 0; i < N; i++)
        {
            if (hash_table [probe [i]] == NULL)
            {
        // if you find an empty entry in the hash table, use it and quit
                hash_table [probe [i]] = next;
                return first;
            }
        }

        // all entries (probes) for this item are full therefore ...

        // pick a random entry

        i = rand () % N;

        // swap the random entry with the current item thus saving
        // the current item and getting a new one

        T * temp = hash_table [probe [i]];
        hash_table [probe [i]] = next;
        next = temp;

        // and try again (trying to insert a new entry this time)
    }

    // create an array of items for linear searching if we fail
    // to insert into the hash table

    // note: lousy allocation strategy ... leads to memory fragmentation
    // but ideally the overflow array is not used much, so it's not a big deal

    T * * new_overflow = new T * [overflow_size + 1];

    for (int i = 0; i < overflow_size; i++)
        new_overflow [i] = overflow [i];
    new_overflow [overflow_size] = next;

    delete [] overflow;

    overflow = new_overflow;
    overflow_size ++;

    test_for_resize ();

    return first;
}

// resize the hash table when overflow_size gets so large
// that the searching performance drops below a certain threshold
// (ideally overflow_size should be zero all the time)

template <class T>
void HashTable<T>::resize (int new_hash_table_size)
{
    int i;

    int old_hash_table_size = hash_table_size;
    T * * old_hash_table = hash_table;
    int old_overflow_size = overflow_size;
    T * * old_overflow = overflow;

    hash_table_size = new_hash_table_size;
    hash_table = new T * [hash_table_size];
    
    for (i = 0; i < hash_table_size; i++)
        hash_table [i] = NULL; // all entries get set to NULL

    overflow = NULL;
    overflow_size = 0;

    for (i = 0; i < old_hash_table_size; i++)
    {
        if (old_hash_table [i] != NULL)
        {
            insert (* old_hash_table [i]);
            delete old_hash_table [i];
        }
    }

    for (i = 0; i < old_overflow_size; i++)
    {
        insert (* old_overflow [i]);
        delete old_overflow [i];
    }

    delete [] old_hash_table;
    delete [] old_overflow;
}

// sample program

// simple data type ... uses an integer as a key

class simple
{
public :

    int key;

    simple (int v)
    {
        key = v;
    }

private :

    friend class HashTable<simple>;

    // required member functions

    simple (simple & item)
    {
        key = item.key;
    }

    bool operator == (simple & rhs)
    {
        return key == rhs.key;
    }

    int next;

    int first_hash_key ()
    {
        next = key;
        return (next * 101) & 0x7fffffff;
    }
    
    int next_hash_key ()
    {
        next ++;
        return (next * 103) & 0x7fffffff;
    }
};

#include <iostream>

int main ()
{
    HashTable<simple> ht (256);

    // insert a known value

    simple x (10);
    ht.insert (x);

    std::cout << "insert: " << x.key << std::endl;

    // insert 5000 random values

    for (int i = 0; i < 5000; i++)
    {
        simple s (rand ());
        ht.insert (s);

        std::cout <<
            s.key << " " <<
            100.0 * ht.count () / ht.size () << " " <<
            ht.over () << std::endl;

    }

    // find the known value

    simple * s = ht.find (x);

    std::cout << "found: " << s->key << std::endl;

    return 0;
}
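
The comment block at the top of the listing spells out the interface that any stored type must provide: a copy constructor, operator ==, first_hash_key () and next_hash_key (). As a hedged illustration (the name_entry class, the multiplier 31, and the LCG constants below are my own example choices, not part of the original code), a class keyed on a short C string could satisfy that interface like this:

#include <cstring>

class name_entry
{
public :

    char name [32];

    name_entry (const char * s)
    {
        std::strncpy (name, s, sizeof (name) - 1);
        name [sizeof (name) - 1] = '\0';
    }

private :

    friend class HashTable<name_entry>;

    // required member functions (see the comment block above HashTable)

    name_entry (name_entry & item)
    {
        std::strcpy (name, item.name);
    }

    bool operator == (name_entry & rhs)
    {
        return std::strcmp (name, rhs.name) == 0;
    }

    unsigned int next;

    int first_hash_key ()
    {
        next = 0;
        for (const char * p = name; * p; p++)
            next = next * 31u + (unsigned char) * p; // simple polynomial string hash
        return (int) (next & 0x7fffffff);
    }

    int next_hash_key ()
    {
        next = next * 1103515245u + 12345u; // step an LCG to get the next probe location
        return (int) (next & 0x7fffffff);
    }
};

It would then be used exactly like the simple class above: declare a HashTable<name_entry> and call insert () and find () with name_entry objects.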

