CodeForge QQ客服 CodeForge 400电话 客服电话 4006316121

HashBinaryDictionary.java ( 文件浏览 )

  • 发布于2016-05-17
  • 浏览次数:0
  • 下载次数:0
  • 下载需 1 积分
  • 侵权举报
			/**
 * Copyright 2007 The Apache Software Foundation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package net.paoding.analysis.dictionary;

import java.util.HashMap;
import java.util.Map;

/**
 * Dictionary的散列+二叉查找实现。
 * <p>
 * 用于对大数量的,且头字符相同的字符串较多的情况,e.g汉字词语字典。在这种情况下,检索速度将比二叉字典更快。
 * <p>
 * 
 * HashBinaryDictionary以一组已经排序的词语为输入,所有<b>头字符</b>相同的词语划为一个集合作为分字典(使用BinaryDictionary实现)。
 * 查找词语时,先根据第一个字符找得分词典(BinaryDictionary实现),再从该分词典中定位该词语。
 * <p>
 * 
 * @author Zhiliang Wang [qieqie.wang@gmail.com]
 * 
 * @see BinaryDictionary
 * 
 * @since 1.0
 * 
 */
public class HashBinaryDictionary implements Dictionary {


	// -------------------------------------------------

	/**
	 * 字典中所有词语,用于方便{
@link #get(int)
}方法
	 */
	private Word[] ascWords;

	/**
	 * 首字符到分词典的映射
	 */
	private Map/* <Object, SubDictionaryWrap> */subs;

	/**
	 * 
	 */
	private final int hashIndex;

	private final int start;
	private final int end;
	private final int count;

	// -------------------------------------------------

	/**
	 * 
	 * @param ascWords
	 *            升序排列词语
	 * @param initialCapacity
	 * @param loadFactor
	 */
	public HashBinaryDictionary(Word[] ascWords, int initialCapacity,
			float loadFactor) {

		this(ascWords, 0, 0, ascWords.length, initialCapacity, loadFactor);
	
}

	public HashBinaryDictionary(Word[] ascWords, int hashIndex, int start,
			int end, int initialCapacity, float loadFactor) {

		this.ascWords = ascWords;
		this.start = start;
		this.end = end;
		this.count = end - start;
		this.hashIndex = hashIndex;
		subs = new HashMap/* <Object, SubDictionaryWrap> */(initialCapacity,
				loadFactor);
		createSubDictionaries();
	
}

	// -------------------------------------------------

	/**
	 * 创建分词典映射,为构造函数调用
	 */
	protected void createSubDictionaries() {

		if (this.start >= ascWords.length) {

			return;
		
}
		
		// 定位相同头字符词语的开头和结束位置以确认分字典
		int beginIndex = this.start;
		int endIndex = this.start + 1;
		
		char beginHashChar = getChar(ascWords[start], hashIndex);
		char endHashChar;
		for (; endIndex < this.end; endIndex++) {

			endHashChar = getChar(ascWords[endIndex], hashIndex);
			if (endHashChar != beginHashChar) {

				addSubDictionary(beginHashChar, beginIndex, endIndex);
				beginIndex = endIndex;
				beginHashChar = endHashChar;
			
}
		
}
		addSubDictionary(beginHashChar, beginIndex, this.end);
	
}

	protected char getChar(CharSequence s, int index) {

		if (index >= s.length()) {

			return (char) 0;
		
}
		return s.charAt(index);
	
}

	/**
	 * 将位置在beginIndex和endIndex之间(不包括endIndex)的词语作为一个分词典
	 * 
	 * @param hashChar
	 * @param beginIndex
	 * @param endIndex
	 */
	protected void addSubDictionary(char hashChar, int beginIndex, int endIndex) {

		Dictionary subDic = createSubDictionary(ascWords, beginIndex, endIndex);
		SubDictionaryWrap subDicWrap = new SubDictionaryWrap(hashChar,
				subDic, beginIndex);
		subs.put(keyOf(hashChar), subDicWrap);
	
}

	protected Dictionary createSubDictionary(Word[] ascWords, int beginIndex,
			int endIndex) {

		int count = endIndex - beginIndex;
		if (count < 16) {

			return new BinaryDictionary(ascWords, beginIndex, endIndex);
		
} else {

			return new HashBinaryDictionary(ascWords, hashIndex + 1,
					beginIndex, endIndex, getCapacity(count), 0.75f);
		
}
	
}

	protected static final int[] capacityCandiate = {
 16, 32, 64, 128, 256,
			512, 1024, 2048, 4096, 10192 
};

	protected int getCapacity(int count) {

		int capacity = -1;
		count <<= 2;
		count /= 3;
		count += 1;
		for (int i = 0; i < capacityCandiate.length; i++) {

			if (count <= capacityCandiate[i]) {

				capacity = capacityCandiate[i];
				break;
			
}
		
}
		if (capacity < 0) {

			capacity = capacityCandiate[capacityCandiate.length - 1];
		
}
		return capacity;
	
}

	// -------------------------------------------------

	public Word get(int index) {

		return ascWords[start + index];
	
}

	public Hit search(CharSequence input, int begin, int count) {

		SubDictionaryWrap subDic = (SubDictionaryWrap) subs.get(keyOf(input
				.charAt(hashIndex + begin)));
		if (subDic == null) {

			return Hit.UNDEFINED;
		
}
		Dictionary dic = subDic.dic;
		// 对count==hashIndex + 1的处理
		if (count == hashIndex + 1) {

			Word header = dic.get(0);
			if (header.length() == hashIndex + 1) {

				if (subDic.wordIndexOffset + 1 < this.ascWords.length) {

					return new Hit(subDic.wordIndexOffset, header,
							this.ascWords[subDic.wordIndexOffset + 1]);
				
} else {

					return new Hit(subDic.wordIndexOffset, header, null);
				
}
			
} else {

				return new Hit(Hit.UNCLOSED_INDEX, null, header);
			
}
		
}
		// count > hashIndex + 1
		Hit word = dic.search(input, begin, count);
		if (word.isHit()) {

			int index = subDic.wordIndexOffset + word.getIndex();
			word.setIndex(index);
			if (word.getNext() == null && index < size()) {

				word.setNext(get(index + 1));
			
}
		
}
		return word;
	
}

	public int size() {

		return count;
	
}

	// -------------------------------------------------

	/**
	 * 字符的在{
@link #subs
}的key值。
	 * 
	 * @param theChar
	 * @return
	 * 
	 * @see #subs
	 */
	protected Object keyOf(char theChar) {

		// return theChar - 0x4E00;// '一'==0x4E00
		return new Integer(theChar);
	
}

	/**
	 * 分词典封箱
	 */
	static class SubDictionaryWrap {

		/**
		 * 分词典词组的头字符
		 */
		char hashChar;

		/**
		 * 分词典
		 */
		Dictionary dic;

		/**
		 * 分词典第一个词语在所有词语中的偏移位置
		 */
		int wordIndexOffset;

		public SubDictionaryWrap(char hashChar, Dictionary dic,
				int wordIndexOffset) {

			this.hashChar = hashChar;
			this.dic = dic;
			this.wordIndexOffset = wordIndexOffset;
		
}
	
}


}
			
...
展开> <收缩

下载源码到电脑,阅读使用更方便

1 积分

快速下载
还剩0行未阅读,继续阅读
Ʋ

源码文件列表

温馨提示: 点击源码文件名可预览文件内容哦 ^_^
...
名称 大小 修改日期
SimpleReadListener2.java.svn-base2.53 kB2012-10-10 10:55
SimpleReadListener.java.svn-base2.72 kB2012-10-10 10:55
ReadListener.java.svn-base936.00 B2012-10-10 10:55
FileWordsReader.java.svn-base3.95 kB2012-10-10 10:55
Difference.java.svn-base2.82 kB2012-10-10 10:55
Detector.java.svn-base3.14 kB2012-10-10 10:55
Node.java.svn-base1.88 kB2012-10-10 10:55
DifferenceListener.java.svn-base879.00 B2012-10-10 10:55
Snapshot.java.svn-base5.98 kB2012-10-10 10:55
ExtensionFileFilter.java.svn-base1.22 kB2012-10-10 10:55
Estimate.java.svn-base4.94 kB2012-10-10 10:55
TryPaodingAnalyzer.java.svn-base10.66 kB2012-10-10 10:55
MaxWordLengthTokenCollector.java.svn-base2.43 kB2012-10-10 10:55
MostWordsTokenCollector.java.svn-base2.88 kB2012-10-10 10:55
SortingDictionariesCompiler.java.svn-base7.04 kB2012-10-10 10:55
CompiledFileDictionaries.java.svn-base8.25 kB2012-10-10 10:55
MostWordsModeDictionariesCompiler.java.svn-base9.05 kB2012-10-10 10:55
all-wcprops854.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries847.00 B2012-10-10 10:55
all-wcprops1.13 kB2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries1.07 kB2012-10-10 10:55
all-wcprops449.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries555.00 B2012-10-10 10:55
TokenCollector.java.svn-base966.00 B2012-10-10 10:55
PaodingAnalyzerBean.java.svn-base4.05 kB2012-10-10 10:55
PaodingAnalyzer.java.svn-base4.46 kB2012-10-10 10:55
PaodingTokenizer.java.svn-base5.00 kB2012-10-10 10:55
all-wcprops1.03 kB2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries1.00 kB2012-10-10 10:55
PaodingAnalysisException.java.svn-base1.16 kB2012-10-10 10:55
KnifeBox.java.svn-base2.47 kB2012-10-10 10:55
LetterKnife.java.svn-base1.50 kB2012-10-10 10:55
PaodingMaker.java.svn-base21.26 kB2012-10-10 10:55
CharSet.java.svn-base2.13 kB2012-10-10 10:55
CombinatoricsKnife.java.svn-base10.65 kB2012-10-10 10:55
DictionariesCompiler.java.svn-base1.28 kB2012-10-10 10:55
FileDictionaries.java.svn-base12.74 kB2012-10-10 10:55
Dictionaries.java.svn-base1.85 kB2012-10-10 10:55
Knife.java.svn-base5.80 kB2012-10-10 10:55
Beef.java.svn-base3.84 kB2012-10-10 10:55
SmartKnifeBox.java.svn-base974.00 B2012-10-10 10:55
Collector.java.svn-base1.55 kB2012-10-10 10:55
Paoding.java.svn-base1.35 kB2012-10-10 10:55
FakeKnife.java.svn-base2.08 kB2012-10-10 10:55
CJKKnife.java.svn-base14.72 kB2012-10-10 10:55
NumberKnife.java.svn-base4.38 kB2012-10-10 10:55
DictionariesWare.java.svn-base853.00 B2012-10-10 10:55
FileDictionariesDifferenceListener.java.svn-base2.42 kB2012-10-10 10:55
CollectorStdoutImpl.java.svn-base1.18 kB2012-10-10 10:55
all-wcprops125.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries318.00 B2012-10-10 10:55
ReadListener.java936.00 B2012-10-10 10:55
FileWordsReader.java3.95 kB2012-10-10 10:55
SimpleReadListener.java2.72 kB2012-10-10 10:55
SimpleReadListener2.java2.53 kB2012-10-10 10:55
Detector.java3.14 kB2012-10-10 10:55
Node.java1.88 kB2012-10-10 10:55
ExtensionFileFilter.java1.22 kB2012-10-10 10:55
Difference.java2.82 kB2012-10-10 10:55
DifferenceListener.java879.00 B2012-10-10 10:55
Snapshot.java5.98 kB2012-10-10 10:55
HashBinaryDictionary.java.svn-base6.67 kB2012-10-10 10:55
Word.java.svn-base1.84 kB2012-10-10 10:55
Dictionary.java.svn-base1.71 kB2012-10-10 10:55
BinaryDictionary.java.svn-base3.15 kB2012-10-10 10:55
Hit.java.svn-base5.01 kB2012-10-10 10:55
DictionaryDelegate.java.svn-base1.30 kB2012-10-10 10:55
TryPaodingAnalyzer.java10.66 kB2012-10-10 10:55
Estimate.java4.94 kB2012-10-10 10:55
all-wcprops750.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries853.00 B2012-10-10 10:55
MaxWordLengthTokenCollector.java2.43 kB2012-10-10 10:55
CompiledFileDictionaries.java8.25 kB2012-10-10 10:55
SortingDictionariesCompiler.java7.04 kB2012-10-10 10:55
MostWordsModeDictionariesCompiler.java9.05 kB2012-10-10 10:55
MostWordsTokenCollector.java2.88 kB2012-10-10 10:55
all-wcprops291.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries421.00 B2012-10-10 10:55
Constants.java.svn-base4.78 kB2012-10-10 10:55
all-wcprops2.88 kB2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries2.76 kB2012-10-10 10:55
all-wcprops1.01 kB2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries1.07 kB2012-10-10 10:55
PaodingTokenizer.java5.00 kB2012-10-10 10:55
TokenCollector.java966.00 B2012-10-10 10:55
PaodingAnalyzer.java4.46 kB2012-10-10 10:55
PaodingAnalyzerBean.java4.05 kB2012-10-10 10:55
ChineseTokenizerFactory.java1.63 kB2012-10-10 11:06
SolrPaodingTokenizer.java1.09 kB2012-10-10 11:06
PaodingAnalysisException.java1.16 kB2012-10-10 10:55
all-wcprops242.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries458.00 B2012-10-10 10:55
SmartKnifeBox.java974.00 B2012-10-10 10:55
PaodingMaker.java21.26 kB2012-10-10 10:55
CJKKnife.java14.72 kB2012-10-10 10:55
DictionariesWare.java853.00 B2012-10-10 10:55
Knife.java5.80 kB2012-10-10 10:55
Collector.java1.55 kB2012-10-10 10:55
Paoding.java1.35 kB2012-10-10 10:55
FileDictionariesDifferenceListener.java2.42 kB2012-10-10 10:55
FakeKnife.java2.08 kB2012-10-10 10:55
CharSet.java2.13 kB2012-10-10 10:55
Dictionaries.java1.85 kB2012-10-10 10:55
CollectorStdoutImpl.java1.18 kB2012-10-10 10:55
FileDictionaries.java12.74 kB2012-10-10 10:55
CombinatoricsKnife.java10.65 kB2012-10-10 10:55
KnifeBox.java2.47 kB2012-10-10 10:55
Beef.java3.84 kB2012-10-10 10:55
LetterKnife.java1.50 kB2012-10-10 10:55
DictionariesCompiler.java1.28 kB2012-10-10 10:55
NumberKnife.java4.38 kB2012-10-10 10:55
BinaryDictionary.java3.15 kB2012-10-10 10:55
Dictionary.java1.71 kB2012-10-10 10:55
DictionaryDelegate.java1.30 kB2012-10-10 10:55
Word.java1.84 kB2012-10-10 10:55
HashBinaryDictionary.java6.67 kB2012-10-10 10:55
Hit.java5.01 kB2012-10-10 10:55
all-wcprops97.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries273.00 B2012-10-10 10:55
Constants.java4.78 kB2012-10-10 10:55
all-wcprops89.00 B2012-10-10 10:55
format2.00 B2012-10-10 10:55
entries264.00 B2012-10-10 10:55
readme36.00 B2012-10-11 16:29
paoding-analysis.properties187.00 B2012-10-10 10:55
paoding-analysis-default.properties220.00 B2012-10-10 10:55
paoding-analyzer.properties389.00 B2012-10-10 10:55
paoding-dic-home.properties450.00 B2012-10-11 11:36
paoding-knives.properties212.00 B2012-10-10 10:55
paoding-knives-user.properties260.00 B2012-10-10 10:55
pom.xml2.99 kB2012-10-13 14:56
zh-solr-se-solr-paoding-analysis-0.1.jar103.19 kB2012-10-13 14:19
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
01.96 kB
Ʋ

HashBinaryDictionary.java (316.60 kB)

需要 1 积分
您持有 积分

CodeForge积分(原CF币)全新升级,功能更强大,使用更便捷,不仅可以用来下载海量源代码马上还可兑换精美小礼品了 了解更多

您的积分不足

支付宝优惠套餐快速获取 30 积分

订单支付完成后,积分将自动加入到您的账号。以下是优惠期的人民币价格,优惠期过后将恢复美元价格。

更多付款方式:网银PayPal

上传代码,免费获取

您本次下载所消耗的积分将转交上传作者。

同一源码,30天内重复下载,只扣除一次积分。

登录 CodeForge

还没有CodeForge账号? 立即注册
关注微博
联系客服

Switch to the English version?

Yes
CodeForge 英文版
No
CodeForge 中文版

完善个人资料,获价值¥30元积分奖励!

^_^"呃 ...

Sorry!这位大神很神秘,未开通博客呢,请浏览一下其他的吧
好的