glibc/db2/hash/hash_dup.c

/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997
 *	Sleepycat Software.  All rights reserved.
 */
/*
 * Copyright (c) 1990, 1993, 1994
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Margo Seltzer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
#include "config.h"

#ifndef lint
static const char sccsid[] = "@(#)hash_dup.c	10.5 (Sleepycat) 7/27/97";
#endif /* not lint */

/*
 * PACKAGE:  hashing
 *
 * DESCRIPTION:
 *      Manipulation of duplicates for the hash package.
 *
 * ROUTINES:
 *
 * External
 *      __add_dup
 * Internal
 */

#ifndef NO_SYSTEM_INCLUDES
#include <sys/types.h>

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#endif

#include "db_int.h"
#include "db_page.h"
#include "db_swap.h"
#include "hash.h"

static int __ham_check_move __P((HTAB *, HASH_CURSOR *, int32_t));
static int __ham_dup_convert __P((HTAB *, HASH_CURSOR *));
static int __ham_make_dup __P((const DBT *, DBT *d, void **, u_int32_t *));

/*
 * Called from hash_access to add a duplicate key. nval is the new
 * value that we want to add.  The flags correspond to the flag values
 * to cursor_put indicating where to add the new element.
 * There are 4 cases.
 * Case 1: The existing duplicate set already resides on a separate page.
 *	   We can use common code for this.
 * Case 2: The element is small enough to just be added to the existing set.
 * Case 3: The element is large enough to be a big item, so we're going to
 *	   have to push the set onto a new page.
 * Case 4: The element is large enough to push the duplicate set onto a
 *	   separate page.
 *
 * PUBLIC: int __ham_add_dup __P((HTAB *, HASH_CURSOR *, DBT *, int));
 */
int
__ham_add_dup(hashp, hcp, nval, flags)
	HTAB *hashp;
	HASH_CURSOR *hcp;
	DBT *nval;
	int flags;
{
	DBT pval, tmp_val;
	HKEYDATA *hk;
	u_int32_t del_len, new_size;
	int ret;

	if (flags == DB_CURRENT && hcp->dpgno == PGNO_INVALID)
		del_len = hcp->dup_len;
	else
		del_len = 0;

	if ((ret = __ham_check_move(hashp, hcp,
	    (int32_t)DUP_SIZE(nval->size) - (int32_t)del_len)) != 0)
		return (ret);

	/*
	 * Check if resulting duplicate set is going to need to go
	 * onto a separate duplicate page.  If so, convert the
	 * duplicate set and add the new one.  After conversion,
	 * hcp->dndx is the first free ndx or the index of the
	 * current pointer into the duplicate set.
	 */
	hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
	new_size = DUP_SIZE(nval->size) - del_len + LEN_HKEYDATA(hcp->pagep,
	    hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx));

	/*
	 * We convert to off-page duplicates if the item is a big item,
	 * the addition of the new item will make the set large, or
	 * if there isn't enough room on this page to add the next item.
	 */
	if (hk->type != H_OFFDUP &&
	    (hk->type == H_OFFPAGE || ISBIG(hashp, new_size) ||
	    DUP_SIZE(nval->size) - del_len > P_FREESPACE(hcp->pagep))) {

		if ((ret = __ham_dup_convert(hashp, hcp)) != 0)
			return (ret);
		else
			hk = H_PAIRDATA(hcp->pagep, hcp->bndx);
	}

	/* There are two separate cases here: on page and off page. */
	if (hk->type != H_OFFDUP) {
		if (hk->type != H_DUPLICATE) {
			hk->type = H_DUPLICATE;
			pval.flags = 0;
			pval.data = hk->data;
			pval.size = LEN_HDATA(hcp->pagep, hashp->hdr->pagesize,
			    hcp->bndx);
			if ((ret = __ham_make_dup(&pval, &tmp_val, &hcp->big_data,
			    &hcp->big_datalen)) != 0 ||
			    (ret = __ham_replpair(hashp, hcp, &tmp_val, 1)) != 0)
				return (ret);
		}

		/* Now make the new entry a duplicate. */
		if ((ret = __ham_make_dup(nval,
		    &tmp_val, &hcp->big_data, &hcp->big_datalen)) != 0)
			return (ret);

		tmp_val.dlen = 0;
		switch (flags) {			/* On page. */
		case DB_KEYFIRST:
			tmp_val.doff = 0;
			break;
		case DB_KEYLAST:
			tmp_val.doff = LEN_HDATA(hcp->pagep,
			    hashp->hdr->pagesize, hcp->bndx);
			break;
		case DB_CURRENT:
			tmp_val.doff = hcp->dup_off;
			tmp_val.dlen = DUP_SIZE(hcp->dup_len);
			break;
		case DB_BEFORE:
			tmp_val.doff = hcp->dup_off;
			break;
		case DB_AFTER:
			tmp_val.doff = hcp->dup_off + DUP_SIZE(hcp->dup_len);
			break;
		}
		/* Add the duplicate. */
		ret = __ham_replpair(hashp, hcp, &tmp_val, 0);
		if (ret == 0)
			ret = __ham_dirty_page(hashp, hcp->pagep);
		__ham_c_update(hashp, hcp, hcp->pgno, tmp_val.size, 1, 1);
		return (ret);
	}

	/* If we get here, then we're on duplicate pages. */
	if (hcp->dpgno == PGNO_INVALID) {
		memcpy(&hcp->dpgno,
		    (u_int8_t *)hk + SSZ(HOFFDUP, pgno), sizeof(db_pgno_t));
		hcp->dndx = 0;
	}

	switch (flags) {
	case DB_KEYFIRST:
		/*
		 * The only way that we are already on a dup page is
		 * if we just converted the on-page representation.
		 * In that case, we've only got one page of duplicates.
		 */
		if (hcp->dpagep == NULL && (ret =
		    __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
			return (ret);
		hcp->dndx = 0;
		break;
	case DB_KEYLAST:
		if (hcp->dpagep == NULL && (ret =
		    __db_dend(hashp->dbp, hcp->dpgno, &hcp->dpagep)) != 0)
			return (ret);
		hcp->dpgno = PGNO(hcp->dpagep);
		hcp->dndx = NUM_ENT(hcp->dpagep);
		break;
	case DB_CURRENT:
		if ((ret = __db_ditem(hashp->dbp, hcp->dpagep, hcp->dndx,
		    BKEYDATA_SIZE(GET_BKEYDATA(hcp->dpagep, hcp->dndx)->len)))
		    != 0)
			return (ret);
		break;
	case DB_BEFORE:	/* The default behavior is correct. */
		break;
	case DB_AFTER:
		hcp->dndx++;
		break;
	}

	ret = __db_dput(hashp->dbp,
	    nval, &hcp->dpagep, &hcp->dndx, __ham_overflow_page);
	hcp->pgno = PGNO(hcp->pagep);
	__ham_c_update(hashp, hcp, hcp->pgno, nval->size, 1, 1);
	return (ret);
}

/*
 * Convert an on-page set of duplicates to an offpage set of duplicates.
 */
static int
__ham_dup_convert(hashp, hcp)
	HTAB *hashp;
	HASH_CURSOR *hcp;
{
	BOVERFLOW bo;
	DBT dbt;
	HOFFPAGE ho;
	db_indx_t dndx, len;
	int ret;
	u_int8_t *p, *pend;

	/*
	 * Create a new page for the duplicates.
	 */
	if ((ret =
	    __ham_overflow_page(hashp->dbp, P_DUPLICATE, &hcp->dpagep)) != 0)
		return (ret);
	hcp->dpagep->type = P_DUPLICATE;
	hcp->dpgno = PGNO(hcp->dpagep);

	/*
	 * Now put the duplicates onto the new page.
	 */
	dbt.flags = 0;
	switch (((HKEYDATA *)H_PAIRDATA(hcp->pagep, hcp->bndx))->type) {
	case H_KEYDATA:
		/* Simple case, one key on page; move it to dup page. */
		dndx = 0;
		dbt.size =
		    LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx);
		dbt.data =
		    ((HKEYDATA *)H_PAIRDATA(hcp->pagep, hcp->bndx))->data;
		ret = __db_pitem(hashp->dbp, hcp->dpagep,
		    (u_int32_t)dndx, BKEYDATA_SIZE(dbt.size), NULL, &dbt);
		if (ret == 0)
			__ham_dirty_page(hashp, hcp->dpagep);
		break;
	case H_OFFPAGE:
		/* Simple case, one key on page; move it to dup page. */
		dndx = 0;
		memcpy(&ho,
		    P_ENTRY(hcp->pagep, H_DATAINDEX(hcp->bndx)), HOFFPAGE_SIZE);
		bo.deleted = 0;
		bo.type = ho.type;
		bo.pgno = ho.pgno;
		bo.tlen = ho.tlen;
		dbt.size = BOVERFLOW_SIZE;
		dbt.data = &bo;

		ret = __db_pitem(hashp->dbp, hcp->dpagep,
		   (u_int32_t)dndx, dbt.size, &dbt, NULL);
		if (ret == 0)
			__ham_dirty_page(hashp, hcp->dpagep);
		break;
	case H_DUPLICATE:
		p = ((HKEYDATA *)H_PAIRDATA(hcp->pagep, hcp->bndx))->data;
		pend = p +
		    LEN_HDATA(hcp->pagep, hashp->hdr->pagesize, hcp->bndx);

		for (dndx = 0; p < pend; dndx++) {
			memcpy(&len, p, sizeof(db_indx_t));
			dbt.size = len;
			p += sizeof(db_indx_t);
			dbt.data = p;
			p += len + sizeof(db_indx_t);
			ret = __db_dput(hashp->dbp, &dbt,
			    &hcp->dpagep, &dndx, __ham_overflow_page);
			if (ret != 0)
				break;
		}
		break;
	default:
		ret = __db_pgfmt(hashp->dbp, (u_long)hcp->pgno);
	}
	if (ret == 0) {
		/*
		 * Now attach this to the source page in place of
		 * the old duplicate item.
		 */
		__ham_move_offpage(hashp, hcp->pagep,
		    (u_int32_t)H_DATAINDEX(hcp->bndx), hcp->dpgno);

		/* Can probably just do a "put" here. */
		ret = __ham_dirty_page(hashp, hcp->pagep);
	} else {
		(void)__ham_del_page(hashp->dbp, hcp->dpagep);
		hcp->dpagep = NULL;
	}
	return (ret);
}

static int
__ham_make_dup(notdup, dup, bufp, sizep)
	const DBT *notdup;
	DBT *dup;
	void **bufp;
	u_int32_t *sizep;
{
	db_indx_t tsize, item_size;
	int ret;
	u_int8_t *p;

	item_size = (db_indx_t)notdup->size;
	tsize = DUP_SIZE(item_size);
	if ((ret = __ham_init_dbt(dup, tsize, bufp, sizep)) != 0)
		return (ret);

	dup->dlen = 0;
	dup->flags = notdup->flags;
	F_SET(dup, DB_DBT_PARTIAL);

	p = dup->data;
	memcpy(p, &item_size, sizeof(db_indx_t));
	p += sizeof(db_indx_t);
	memcpy(p, notdup->data, notdup->size);
	p += notdup->size;
	memcpy(p, &item_size, sizeof(db_indx_t));

	dup->doff = 0;
	dup->dlen = notdup->size;

	return (0);
}

static int
__ham_check_move(hashp, hcp, add_len)
	HTAB *hashp;
	HASH_CURSOR *hcp;
	int32_t add_len;
{
	DBT k, d;
	DB_LSN new_lsn;
	HKEYDATA *hk;
	PAGE *next_pagep;
	db_pgno_t next_pgno;
	int rectype, ret;
	u_int32_t new_datalen, old_len;

	/*
	 * Check if we can do whatever we need to on this page.  If not,
	 * then we'll have to move the current element to a new page.
	 */

	hk = H_PAIRDATA(hcp->pagep, hcp->bndx);

	/*
	 * If the item is already off page duplicates or an offpage item,
	 * then we know we can do whatever we need to do in-place
	 */
	if (hk->type == H_OFFDUP || hk->type == H_OFFPAGE)
		return (0);

	old_len =
	    LEN_HITEM(hcp->pagep, hashp->hdr->pagesize, H_DATAINDEX(hcp->bndx));
	new_datalen = old_len - HKEYDATA_SIZE(0) + add_len;

	/*
	 * We need to add a new page under two conditions:
	 * 1. The addition makes the total data length cross the BIG
	 *    threshold and the OFFDUP structure won't fit on this page.
	 * 2. The addition does not make the total data cross the
	 *    threshold, but the new data won't fit on the page.
	 * If neither of these is true, then we can return.
	 */
	if (ISBIG(hashp, new_datalen) && (old_len > HOFFDUP_SIZE ||
	    HOFFDUP_SIZE - old_len <= P_FREESPACE(hcp->pagep)))
		return (0);

	if (!ISBIG(hashp, new_datalen) &&
	    add_len <= (int32_t)P_FREESPACE(hcp->pagep))
		return (0);

	/*
	 * If we get here, then we need to move the item to a new page.
	 * Check if there are more pages in the chain.
	 */

	new_datalen = ISBIG(hashp, new_datalen) ?
	    HOFFDUP_SIZE : HKEYDATA_SIZE(new_datalen);

	next_pagep = NULL;
	for (next_pgno = NEXT_PGNO(hcp->pagep); next_pgno != PGNO_INVALID;
	    next_pgno = NEXT_PGNO(next_pagep)) {
		if (next_pagep != NULL &&
		    (ret = __ham_put_page(hashp->dbp, next_pagep, 0)) != 0)
			return (ret);

		if ((ret = __ham_get_page(hashp->dbp, next_pgno, &next_pagep)) != 0)
			return (ret);

		if (P_FREESPACE(next_pagep) >= new_datalen)
			break;
	}

	/* No more pages, add one. */
	if (next_pagep == NULL &&
	    (ret = __ham_add_ovflpage(hashp, hcp->pagep, 0, &next_pagep)) != 0)
		return (ret);

	/* Add new page at the end of the chain. */
	if (P_FREESPACE(next_pagep) < new_datalen &&
	    (ret = __ham_add_ovflpage(hashp, next_pagep, 1, &next_pagep)) != 0)
		return (ret);

	/* Copy the item to the new page. */
	if (DB_LOGGING(hashp->dbp)) {
		rectype = PUTPAIR;
		k.flags = 0;
		d.flags = 0;
		if (H_PAIRKEY(hcp->pagep, hcp->bndx)->type == H_OFFPAGE) {
			rectype |= PAIR_KEYMASK;
			k.data = H_PAIRKEY(hcp->pagep, hcp->bndx);
			k.size = HOFFPAGE_SIZE;
		} else {
			k.data = H_PAIRKEY(hcp->pagep, hcp->bndx)->data;
			k.size = LEN_HKEY(hcp->pagep,
			    hashp->hdr->pagesize, hcp->bndx);
		}

		if (hk->type == H_OFFPAGE) {
			rectype |= PAIR_DATAMASK;
			d.data = H_PAIRDATA(hcp->pagep, hcp->bndx);
			d.size = HOFFPAGE_SIZE;
		} else {
			d.data = H_PAIRDATA(hcp->pagep, hcp->bndx)->data;
			d.size = LEN_HDATA(hcp->pagep,
			    hashp->hdr->pagesize, hcp->bndx);
		}


		if ((ret = __ham_insdel_log(hashp->dbp->dbenv->lg_info,
		    (DB_TXN *)hashp->dbp->txn, &new_lsn, 0, rectype,
		    hashp->dbp->log_fileid, PGNO(next_pagep),
		    (u_int32_t)H_NUMPAIRS(next_pagep), &LSN(next_pagep),
		    &k, &d)) != 0)
			return (ret);

		/* Move lsn onto page. */
		LSN(next_pagep) = new_lsn;	/* Structure assignment. */
	}

	__ham_copy_item(hashp, hcp->pagep, H_KEYINDEX(hcp->bndx), next_pagep);
	__ham_copy_item(hashp, hcp->pagep, H_DATAINDEX(hcp->bndx), next_pagep);

	/* Now delete the pair from the current page. */
	ret = __ham_del_pair(hashp, hcp);

	(void)__ham_put_page(hashp->dbp, hcp->pagep, 1);
	hcp->pagep = next_pagep;
	hcp->pgno = PGNO(hcp->pagep);
	hcp->bndx = H_NUMPAIRS(hcp->pagep) - 1;
	F_SET(hcp, H_EXPAND);
	return (ret);
}

/*
 * Replace an onpage set of duplicates with the OFFDUP structure that
 * references the duplicate page.
 * XXX This is really just a special case of __onpage_replace; we should
 * probably combine them.
 * PUBLIC: void __ham_move_offpage __P((HTAB *, PAGE *, u_int32_t, db_pgno_t));
 */
void
__ham_move_offpage(hashp, pagep, ndx, pgno)
	HTAB *hashp;
	PAGE *pagep;
	u_int32_t ndx;
	db_pgno_t pgno;
{
	DBT new_dbt;
	DBT old_dbt;
	HOFFDUP od;
	db_indx_t i;
	int32_t shrink;
	u_int8_t *src;

	od.type = H_OFFDUP;
	od.pgno = pgno;

	if (DB_LOGGING(hashp->dbp)) {
		new_dbt.data = &od;
		new_dbt.size = HOFFDUP_SIZE;
		old_dbt.data = P_ENTRY(pagep, ndx);
		old_dbt.size = LEN_HITEM(pagep, hashp->hdr->pagesize, ndx);
		(void)__ham_replace_log(hashp->dbp->dbenv->lg_info,
		    (DB_TXN *)hashp->dbp->txn, &LSN(pagep), 0,
		    hashp->dbp->log_fileid, PGNO(pagep), (u_int32_t)ndx,
		    &LSN(pagep), -1, &old_dbt, &new_dbt, 0);
	}

	shrink =
	    LEN_HITEM(pagep, hashp->hdr->pagesize, ndx) - HOFFDUP_SIZE;

	if (shrink != 0) {
		/* Copy data. */
		src = (u_int8_t *)(pagep) + HOFFSET(pagep);
		memmove(src + shrink, src, pagep->inp[ndx] - HOFFSET(pagep));
		HOFFSET(pagep) += shrink;

		/* Update index table. */
		for (i = ndx; i < NUM_ENT(pagep); i++)
			pagep->inp[i] += shrink;
	}

	/* Now copy the offdup entry onto the page. */
	memcpy(P_ENTRY(pagep, ndx), &od, HOFFDUP_SIZE);
}