org.eclipse.jgit/src/org/eclipse/jgit/diff/HistogramDiffIndex.java - sop/jgit

blob: 04c79fcd1b4d0eda52435f9eeeeec81daa051d8b [file] [log] [blame]

	/*
	* Copyright (C) 2010, Google Inc.
	* and other copyright owners as documented in the project's IP log.
	*
	* This program and the accompanying materials are made available
	* under the terms of the Eclipse Distribution License v1.0 which
	* accompanies this distribution, is reproduced below, and is
	* available at http://www.eclipse.org/org/documents/edl-v10.php
	*
	* All rights reserved.
	*
	* Redistribution and use in source and binary forms, with or
	* without modification, are permitted provided that the following
	* conditions are met:
	*
	* - Redistributions of source code must retain the above copyright
	* notice, this list of conditions and the following disclaimer.
	*
	* - Redistributions in binary form must reproduce the above
	* copyright notice, this list of conditions and the following
	* disclaimer in the documentation and/or other materials provided
	* with the distribution.
	*
	* - Neither the name of the Eclipse Foundation, Inc. nor the
	* names of its contributors may be used to endorse or promote
	* products derived from this software without specific prior
	* written permission.
	*
	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
	* CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
	* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
	* OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
	* CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
	* SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
	* NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
	* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
	* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
	* STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
	* ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
	*/

	package org.eclipse.jgit.diff;

	import org.eclipse.jgit.internal.JGitText;

	/**
	* Support {@link HistogramDiff} by computing occurrence counts of elements.
	* <p>
	* Each element in the range being considered is put into a hash table, tracking
	* the number of times that distinct element appears in the sequence. Once all
	* elements have been inserted from sequence A, each element of sequence B is
	* probed in the hash table and the longest common subsequence with the lowest
	* occurrence count in A is used as the result.
	*
	* @param <S>
	* type of the base sequence.
	*/
	final class HistogramDiffIndex<S extends Sequence> {
	private static final int REC_NEXT_SHIFT = 28 + 8;

	private static final int REC_PTR_SHIFT = 8;

	private static final int REC_PTR_MASK = (1 << 28) - 1;

	private static final int REC_CNT_MASK = (1 << 8) - 1;

	private static final int MAX_PTR = REC_PTR_MASK;

	private static final int MAX_CNT = (1 << 8) - 1;

	private final int maxChainLength;

	private final HashedSequenceComparator<S> cmp;

	private final HashedSequence<S> a;

	private final HashedSequence<S> b;

	private final Edit region;

	/** Keyed by {@link #hash(HashedSequence, int)} for {@link #recs} index. */
	private final int[] table;

	/** Number of low bits to discard from a key to index {@link #table}. */
	private final int keyShift;

	/**
	* Describes a unique element in sequence A.
	*
	* The records in this table are actually 3-tuples of:
	* <ul>
	* <li>index of next record in this table that has same hash code</li>
	* <li>index of first element in this occurrence chain</li>
	* <li>occurrence count for this element (length of locs list)</li>
	* </ul>
	*
	* The occurrence count is capped at {@link #MAX_CNT}, as the field is only
	* a few bits wide. Elements that occur more frequently will have their
	* count capped.
	*/
	private long[] recs;

	/** Number of elements in {@link #recs}; also is the unique element count. */
	private int recCnt;

	/**
	* For {@code ptr}, {@code next[ptr - ptrShift]} has subsequent index.
	*
	* For the sequence element {@code ptr}, the value stored at location
	* {@code next[ptr - ptrShift]} is the next occurrence of the exact same
	* element in the sequence.
	*
	* Chains always run from the lowest index to the largest index. Therefore
	* the array will store {@code next[1] = 2}, but never {@code next[2] = 1}.
	* This allows a chain to terminate with {@code 0}, as {@code 0} would never
	* be a valid next element.
	*
	* The array is sized to be {@code region.getLengthA()} and element indexes
	* are converted to array indexes by subtracting {@link #ptrShift}, which is
	* just a cached version of {@code region.beginA}.
	*/
	private int[] next;

	/**
	* For element {@code ptr} in A, index of the record in {@link #recs} array.
	*
	* The record at {@code recs[recIdx[ptr - ptrShift]]} is the record
	* describing all occurrences of the element appearing in sequence A at
	* position {@code ptr}. The record is needed to get the occurrence count of
	* the element, or to locate all other occurrences of that element within
	* sequence A. This index provides constant-time access to the record, and
	* avoids needing to scan the hash chain.
	*/
	private int[] recIdx;

	/** Value to subtract from element indexes to key {@link #next} array. */
	private int ptrShift;

	private Edit lcs;

	private int cnt;

	private boolean hasCommon;

	HistogramDiffIndex(int maxChainLength, HashedSequenceComparator<S> cmp,
	HashedSequence<S> a, HashedSequence<S> b, Edit r) {
	this.maxChainLength = maxChainLength;
	this.cmp = cmp;
	this.a = a;
	this.b = b;
	this.region = r;

	if (region.endA >= MAX_PTR)
	throw new IllegalArgumentException(
	JGitText.get().sequenceTooLargeForDiffAlgorithm);

	final int sz = r.getLengthA();
	final int tableBits = tableBits(sz);
	table = new int[1 << tableBits];
	keyShift = 32 - tableBits;
	ptrShift = r.beginA;

	recs = new long[Math.max(4, sz >>> 3)];
	next = new int[sz];
	recIdx = new int[sz];
	}

	Edit findLongestCommonSequence() {
	if (!scanA())
	return null;

	lcs = new Edit(0, 0);
	cnt = maxChainLength + 1;

	for (int bPtr = region.beginB; bPtr < region.endB;)
	bPtr = tryLongestCommonSequence(bPtr);

	return hasCommon && maxChainLength < cnt ? null : lcs;
	}

	private boolean scanA() {
	// Scan the elements backwards, inserting them into the hash table
	// as we go. Going in reverse places the earliest occurrence of any
	// element at the start of the chain, so we consider earlier matches
	// before later matches.
	//
	SCAN: for (int ptr = region.endA - 1; region.beginA <= ptr; ptr--) {
	final int tIdx = hash(a, ptr);

	int chainLen = 0;
	for (int rIdx = table[tIdx]; rIdx != 0;) {
	final long rec = recs[rIdx];
	if (cmp.equals(a, recPtr(rec), a, ptr)) {
	// ptr is identical to another element. Insert it onto
	// the front of the existing element chain.
	//
	int newCnt = recCnt(rec) + 1;
	if (MAX_CNT < newCnt)
	newCnt = MAX_CNT;
	recs[rIdx] = recCreate(recNext(rec), ptr, newCnt);
	next[ptr - ptrShift] = recPtr(rec);
	recIdx[ptr - ptrShift] = rIdx;
	continue SCAN;
	}

	rIdx = recNext(rec);
	chainLen++;
	}

	if (chainLen == maxChainLength)
	return false;

	// This is the first time we have ever seen this particular
	// element in the sequence. Construct a new chain for it.
	//
	final int rIdx = ++recCnt;
	if (rIdx == recs.length) {
	int sz = Math.min(recs.length << 1, 1 + region.getLengthA());
	long[] n = new long[sz];
	System.arraycopy(recs, 0, n, 0, recs.length);
	recs = n;
	}

	recs[rIdx] = recCreate(table[tIdx], ptr, 1);
	recIdx[ptr - ptrShift] = rIdx;
	table[tIdx] = rIdx;
	}
	return true;
	}

	private int tryLongestCommonSequence(final int bPtr) {
	int bNext = bPtr + 1;
	int rIdx = table[hash(b, bPtr)];
	for (long rec; rIdx != 0; rIdx = recNext(rec)) {
	rec = recs[rIdx];

	// If there are more occurrences in A, don't use this chain.
	if (recCnt(rec) > cnt) {
	if (!hasCommon)
	hasCommon = cmp.equals(a, recPtr(rec), b, bPtr);
	continue;
	}

	int as = recPtr(rec);
	if (!cmp.equals(a, as, b, bPtr))
	continue;

	hasCommon = true;
	TRY_LOCATIONS: for (;;) {
	int np = next[as - ptrShift];
	int bs = bPtr;
	int ae = as + 1;
	int be = bs + 1;
	int rc = recCnt(rec);

	while (region.beginA < as && region.beginB < bs
	&& cmp.equals(a, as - 1, b, bs - 1)) {
	as--;
	bs--;
	if (1 < rc)
	rc = Math.min(rc, recCnt(recs[recIdx[as - ptrShift]]));
	}
	while (ae < region.endA && be < region.endB
	&& cmp.equals(a, ae, b, be)) {
	if (1 < rc)
	rc = Math.min(rc, recCnt(recs[recIdx[ae - ptrShift]]));
	ae++;
	be++;
	}

	if (bNext < be)
	bNext = be;
	if (lcs.getLengthA() < ae - as \|\| rc < cnt) {
	// If this region is the longest, or there are less
	// occurrences of it in A, its now our LCS.
	//
	lcs.beginA = as;
	lcs.beginB = bs;
	lcs.endA = ae;
	lcs.endB = be;
	cnt = rc;
	}

	// Because we added elements in reverse order index 0
	// cannot possibly be the next position. Its the first
	// element of the sequence and thus would have been the
	// value of as at the start of the TRY_LOCATIONS loop.
	//
	if (np == 0)
	break TRY_LOCATIONS;

	while (np < ae) {
	// The next location to consider was actually within
	// the LCS we examined above. Don't reconsider it.
	//
	np = next[np - ptrShift];
	if (np == 0)
	break TRY_LOCATIONS;
	}

	as = np;
	}
	}
	return bNext;
	}

	private int hash(HashedSequence<S> s, int idx) {
	return (cmp.hash(s, idx) * 0x9e370001 /* mix bits */) >>> keyShift;
	}

	private static long recCreate(int next, int ptr, int cnt) {
	return ((long) next << REC_NEXT_SHIFT) //
	\| ((long) ptr << REC_PTR_SHIFT) //
	\| cnt;
	}

	private static int recNext(long rec) {
	return (int) (rec >>> REC_NEXT_SHIFT);
	}

	private static int recPtr(long rec) {
	return ((int) (rec >>> REC_PTR_SHIFT)) & REC_PTR_MASK;
	}

	private static int recCnt(long rec) {
	return ((int) rec) & REC_CNT_MASK;
	}

	private static int tableBits(final int sz) {
	int bits = 31 - Integer.numberOfLeadingZeros(sz);
	if (bits == 0)
	bits = 1;
	if (1 << bits < sz)
	bits++;
	return bits;
	}
	}