辞書アプリの改良:複数辞書対応+α

改良点

  • 複数辞書対応(同階層にある*.metaを全てメタインデックスファイルと見なす)
  • ゲンゾウさん添削の取り込み
  • 1インデックス内に同一連続文字があると検索結果が重複するバグを修正
  • インデックスファイル、メタインデックスファイルのopen/closeのスコープを縮小(パフォーマンスにほぼ差異なし)

ソース

// Every *.meta file in the current directory names one dictionary.
def dictionaries = []

// Escape the dot: the original pattern ".*.meta" let ANY character precede
// "meta", so a file named e.g. "foometa" was wrongly picked up.
new File(".").eachFileMatch(~/.*\.meta/) {
	dictionaries <<  new Dictionary(it.name.substring(0, it.name.length()-5))
}

key = args[0]
println "search: ${key}"

def timer = new Timer("All")
// headword -> list of [d: dictionary, i: index] hits across all dictionaries
def resultMap = [:]
dictionaries.each {dictionary ->
	def resultList = dictionary.search(key)
	resultList.each {idx ->
		// Groovy's Map.get(key, default) stores the default in the map,
		// so this appends to a per-word list in place.
		resultMap.get(idx.word, []) << [d:dictionary, i:idx]
	}
}
resultMap.keySet().sort({l,r -> l.compareTo(r)}).each {word ->
	println word
	// Print definition bodies only when the hit count is small enough to read.
	if (resultMap.size() <= 100) {
		resultMap.get(word).each {map ->
			println "=> " + map.d.getDefinition(map.i)
		}
	}
}
println "count:${resultMap.size()}" 
timer.stop()

/**
 * One dictionary: a meta index (&lt;name&gt;.meta), an index file (&lt;name&gt;.idx)
 * and a definition-body file (&lt;name&gt;.dict).
 */
class Dictionary extends BaseDictionary {
	def meta       // MetaDictionary used for fragment lookup
	def file       // index file (.idx)
	def dictFile   // definition body file (.dict)
	def Dictionary(String fileName) {
		meta = new MetaDictionary(fileName)
		file = new File(fileName + ".idx")
		dictFile = new File(fileName + ".dict")
	}
	
	/**
	 * Returns the Index entries of every headword containing `key`, sorted by word.
	 * Strategy: split the key into overlapping fragments, look each fragment up
	 * in the meta index, intersect the hit lists by entry offset, then verify
	 * the fragments occur at the expected relative positions.
	 */
	def search(key) {
		def timer = new Timer("meta search")
		def keyFragments = cut(key)
		def metaIndexesList = []
		for (fragment in keyFragments) {
			def t = new Timer("meta search each")
			metaIndexesList << meta.search(fragment.word)
			t.stop()
		}
		timer.stop()
		// If any fragment has no hits the intersection is necessarily empty,
		// so skip the (potentially expensive) intersection entirely.
		if (metaIndexesList.empty || metaIndexesList.any {it.empty}) return []
		
		timer = new Timer("weak intersection")
		// Sort ascending by hit count so the intersection starts from the
		// smallest list. (The original comparator sorted DESCENDING, which
		// contradicted this intent and did the most work first; the final
		// result set is identical either way.)
		metaIndexesList.sort {l, r -> return l.size() <=> r.size()}
		def andList = metaIndexesList.get(0).clone()
		(1..<metaIndexesList.size()).each{i->
			def mapA = [:]
			def mapB = [:]
			metaIndexesList.get(i).each {metaIndex -> mapA.get(metaIndex.offset, []) << metaIndex}
			andList.each {metaIndex ->
					if (mapA.containsKey(metaIndex.offset)) mapB.get(metaIndex.offset, []) << metaIndex
				}
			// Keep only offsets present on both sides, carrying the entries
			// from both lists so the position filter below can see them all.
			def tmpList = []
			mapB.each {mapKey, value ->
				tmpList.addAll(mapA.get(mapKey))
				tmpList.addAll(value)
			}
			andList = tmpList
		}
		timer.stop()
		
		// Verify each fragment occurs at its expected position within the entry.
		timer = new Timer("filter")
		def map = [:]
		andList.each { map.get(it.offset, []) << it }
		
		def resultMap = [:]
		map.keySet().each {offsetKey ->
				def metaList = map.get(offsetKey)
				// Anchor on occurrences of the first fragment of the search key.
				def baseList = metaList.findAll {x -> x.word == keyFragments[0].word}
				baseList.each {base ->
						def flag = true
						for (fragment in keyFragments) {
							flag = flag && 
								metaList.any {
									x -> fragment.word == x.word && (x.wordOffset - base.wordOffset) == fragment.wordOffset
								}
						}
						// get(key, default) inserts the default, so duplicate hits
						// at the same offset collapse to a single result.
						if (flag) resultMap.get(base.offset, base)
					}
			}
		timer.stop()
		def resultList = resultMap.collect {k, v -> getIndex(v.offset, v.length)}
		resultList.sort {l, r-> l.word <=> r.word}
		return resultList
	}
	
	// NOTE(review): unused — search() uses Map.get(key, default) instead.
	// Kept for interface compatibility.
	def append(Map map, MetaIndex idx) {
		if (map.containsKey(idx.offset)) {
			map.get(idx.offset) << idx
		} else {
			map.put(idx.offset, [idx])
		}
	}
	
	// Splits `string` into overlapping KEY_CHAR_LENGTH-character fragments,
	// recording each fragment's character offset within the original string.
	def cut(string) {
		if (string.length() < MetaIndex.KEY_CHAR_LENGTH) return [new MetaIndex(string, 0, 0, 0)]
		def list = []
		int i=0
		while (i <string.length()-MetaIndex.KEY_CHAR_LENGTH +1) {
			list << new MetaIndex(string.substring(i, i+MetaIndex.KEY_CHAR_LENGTH), 0, 0, i)
			i++
		}
		return list
	}
	
	// Reads one Index record from the .idx file.
	def getIndex(int offset, int length) {
		return new Index(read(file, offset, length), 0, length)
	}
	
	// Reads the UTF-8 definition text referenced by an Index record.
	def getDefinition(index) {
		return new String(read(dictFile, index.offset, index.length), "UTF-8")
	}
}

/**
 * Binary-searchable view over a sorted <fileName>.meta file made of
 * fixed-length MetaIndex records.
 */
class MetaDictionary extends BaseDictionary {
	def file = null
	// Number of fixed-length records in the file.
	int count = 0
	def MetaDictionary(fileName) {
		file = new File(fileName + ".meta")
		count = file.length() / MetaIndex.LENGTH
	}

	// Returns every MetaIndex whose word equals `key`. Records are sorted,
	// so all matches form one contiguous run located by two binary searches.
	def search(key) {
		assert (key.length() <= MetaIndex.KEY_CHAR_LENGTH)
		int start = searchStart(key, 0, count)
		int end = searchEnd(key, start, count)
		if (start > end) return loadIndex(0, 0) // no match: empty list
		return loadIndex(start, (end-start+1))
	}

	// Reads the i-th record from the file.
	def getIndex(int i) {
		return new MetaIndex(read(file, i*MetaIndex.LENGTH, MetaIndex.LENGTH))
	}
	
	// Reads `indexCount` records starting at record `indexOffset`.
	// indexCount == -1 (or more than the file holds) means "all records".
	def loadIndex(indexOffset = 0, indexCount = -1) {
		if (indexOffset < 0) indexOffset = 0
		if (indexCount == -1 || file.length() < (indexCount * MetaIndex.LENGTH)) indexCount = count
		byte[] buf = read(file, indexOffset * MetaIndex.LENGTH, indexCount * MetaIndex.LENGTH)

		def indexes = (0..<indexCount).collect {
			new MetaIndex(buf, it * MetaIndex.LENGTH)
		}
		return indexes
	}
	// Recursive binary search for the FIRST record whose word >= key.
	// Returns an index in [start, end]; caller checks start > end for "no hit".
	def searchStart(key, int start, int end) {
		if (start == end || end < 0) return start
		int i = (int)Math.floor((start+end)/2)
		def idx = getIndex(i)
		def c = idx.compareTo(key)
//		println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"
		if (c >= 0) {
			return searchStart(key, start, i)
		} else {
			// Two-element window: floor() would pin i to start and loop forever.
			if ((end-start)==1) i = end
			return searchStart(key, i, end)
		}
	}
	// Recursive binary search for the LAST record whose word <= key.
	def searchEnd(key, start, end) {
		if (start == end || end < 0) return end
		def i = (int)Math.ceil((start+end)/2)
		if (i>end) i = end
		def idx = getIndex(i)
		def c = idx.compareTo(key)
//		println "start:${start}, end:${end}, i:${i}, c:${c}, idx:${idx}, key:${key}, key:${key.getBytes()}"
		if (c <= 0) {
			return searchEnd(key, i, end)
		} else {
			// Mirror of searchStart: avoid ping-ponging on a two-element window.
			if ((end-start)==1) i = start
			return searchEnd(key, start, i)
		}
	}
}

/** Shared random-access file reading for the dictionary classes. */
class BaseDictionary {
	/**
	 * Reads `length` bytes starting at `offset` from `file`.
	 * Any portion past EOF is left zero-filled (callers rely on this when a
	 * computed range runs past the end of the file).
	 */
	byte[] read(File file, long offset, long length) {
		byte[] buf = new byte[length]
		def raf = new RandomAccessFile(file, "r")
		try {
			raf.seek(offset)
			// Loop: a single read() may return fewer bytes than requested,
			// which would silently leave the tail of the buffer zeroed.
			int pos = 0
			while (pos < buf.length) {
				int n = raf.read(buf, pos, buf.length - pos)
				if (n < 0) break // EOF: keep the remainder zero-filled
				pos += n
			}
		} finally {
			// Close even on failure — the original leaked the handle on exception.
			raf.close()
		}
		return buf
	}
}

/** Prints elapsed wall-clock milliseconds between construction and stop(). */
class Timer {
	def name
	def time = new Date()

	def Timer(n) {
		name = n
	}

	def stop() {
		def elapsed = new Date().getTime() - time.getTime()
		println "TIME[${name}]:${elapsed}" 
	}
}

実行結果

2つの辞書にあるキーワードがなかなか見つからん。
やっぱりこのタイ語辞書は変な言葉ばかりだ

% ./dict.sh 辞書  
search: 辞書
TIME[meta search each]:225
TIME[meta search]:259
TIME[weak intersection]:49
TIME[filter]:130
TIME[meta search each]:44
TIME[meta search]:44
TIME[weak intersection]:1
TIME[filter]:5
この辞書は、ソムシィー先生によって編纂されました。
=> &#3614;&#3592;&#3609;&#3634;&#3609;&#3640;&#3585;&#3619;&#3617;&#3648;&#3621;&#3656;&#3617;&#3609;&#3637;&#3657;&#3649;&#3605;&#3656;&#3591;&#3650;&#3604;&#3618;&#3629;&#3634;&#3592;&#3634;&#3619;&#3618;&#3660;&#3626;&#3617;&#3624;&#3619;&#3637;
どんな辞書を使っているの?
=> &#3651;&#3594;&#3657;&#3614;&#3592;&#3609;&#3634;&#3609;&#3640;&#3585;&#3619;&#3617;&#3649;&#3610;&#3610;&#3652;&#3627;&#3609;
はい。(男性の丁寧語・若い人は、ほとんどR音を発音しません、Rを発音しない方が多数派です。本辞書は一応、khrapと正式な発音記号をいれています)
=> &#3588;&#3619;&#3633;&#3610;
アルファベットの辞書
=> &#3629;&#3633;&#3585;&#3586;&#3619;&#3634;&#3609;&#3640;&#3585;&#3619;&#3617;
ウエブスター(1758〜1843米国の辞書編纂家)
=> &#3648;&#3623;&#3655;&#3610;&#3626;&#3648;&#3605;&#3629;&#3619;&#3660;
人名辞書
=> [じんめいじしょ] /biographical dictionary/
何冊(本・書籍・辞書・雑誌・ノート)
=> &#3648;&#3621;&#3656;&#3617;
別の辞書に当たって見る
=> [べつのじしょにあたってみる] /(exp) to try another dictionary/
和英辞書
=> [わえいじしょ] /(n) Japanese-English dictionary/
日葡辞書
=> [にっぽじしょ] /Japanese-Portuguese dictionary/
英々辞書
=> [えいえいじしょ] /(n) English-English dictionary/
英英辞書
=> [えいえいじしょ] /(n) English-English dictionary/
辞書
=> [じしょ] /(n) dictionary/lexicon/(P)/
=> &#3604;&#3636;&#3588;, &#3614;&#3592;&#3609;&#3634;&#3609;&#3640;&#3585;&#3619;&#3617;, &#3624;&#3633;&#3614;&#3607;&#3660;&#3649;&#3626;&#3591;
辞書に拠れば
=> [じしょによれば] /(exp) based on (according to) the dictionary/
辞書を出す
=> [じしょをだす] /(exp) to publish a dictionary/
辞書を引く
=> &#3648;&#3611;&#3636;&#3604;&#3604;&#3636;&#3588;
辞書を引くのは面倒だ。
=> &#3585;&#3634;&#3619;&#3592;&#3632;&#3648;&#3611;&#3636;&#3604;&#3614;&#3592;&#3609;&#3634;&#3609;&#3640;&#3585;&#3619;&#3617;&#3648;&#3611;&#3655;&#3609;&#3648;&#3619;&#3639;&#3656;&#3629;&#3591;&#3618;&#3640;&#3656;&#3591;&#3618;&#3634;&#3585;
辞書を繰る
=> [じしょをくる] /(exp) to consult a dictionary/
辞書部門
=> [じしょぶもん] /(n) lexicon/
辞書類
=> [じしょるい] /(n) dictionaries (and similar books)/
電子辞書
=> [でんしじしょ] /electronic dictionary/
=> &#3614;&#3592;&#3609;&#3634;&#3609;&#3640;&#3585;&#3619;&#3617;&#3629;&#3636;&#3648;&#3621;&#3655;&#3585;&#3607;&#3619;&#3629;&#3609;&#3636;&#3585;&#3626;&#3660;
count:21
TIME[All]:770

メタインデックス作成ツールも対象ファイルを指定できるよう修正。

Indexer.groovy

// Indexer entry point: builds <fileName>.meta from <fileName>.idx,
// batching MetaIndex records through per-character tmp files in work/.
fileName = args[0]
workDir = new File("work")
deleteTmpFile()
workDir.mkdirs()

def indexes = loadCompleteIndex()

// Flush a batch to tmp files every 1000 index entries to bound memory use.
List metaList = []
indexes.eachWithIndex { idx, n ->
	metaList.addAll(idx.toMetaIndexList())

	def done = n + 1
	if (done % 1000 == 0) {
		makeTmpFile(metaList)
		metaList = []
		println ("parsing:" + (done * 100 / indexes.size()) + "%")
	}
}
makeTmpFile(metaList)
makeMetaFile()

// Merges the per-first-character tmp files into one sorted .meta file.
def makeMetaFile() {
	File outFile = new File(fileName + ".meta")
	outFile.delete()
	outFile.createNewFile()
	def os = new BufferedOutputStream(new FileOutputStream(outFile))
	try {
		listTmpFiles().each {f ->
				println ("writing: " + hex2str(f.name) + ":" + f.length() + ":" + (f.length() / MetaIndex.LENGTH))
				def buf = f.readBytes()
				def list = []
				def i = 0
				while ((i*MetaIndex.LENGTH) < buf.length) {
					list << new MetaIndex(buf, i*MetaIndex.LENGTH)
					i++
				}
				// Records inside one tmp file are unsorted; the tmp files
				// themselves are visited in sorted order by listTmpFiles().
				list.sort()
				for (m in list) {
					os.write(m.toBytes())
				}
			}
	} finally {
		// Close even on failure — the original leaked the stream on exception.
		// (Also dropped the dead `i++` counter from the write loop.)
		os.close()
	}
}

// Tmp files are named by the hex encoding of their first character; decode
// the names and sort so the merge emits records in headword order.
def listTmpFiles() {
	def files = workDir.listFiles().toList()
	return files.sort {a, b -> hex2str(a.name) <=> hex2str(b.name)}
}

// Appends the (sorted) batch to per-first-character tmp files in workDir.
// Files are opened in append mode so successive batches accumulate.
// NOTE(review): the initial file for "a" (hex "61") is always created, even
// when empty — harmless, since makeMetaFile() no-ops on zero-length files.
def makeTmpFile(metaList) {
	metaList.sort()
	def prev = str2Hex("a")
	def tmpFile = new BufferedOutputStream(new FileOutputStream(new File(workDir, prev), true))
	try {
		for (m in metaList) {
			def first = str2Hex(m.word.substring(0,1))
			if (prev != first) {
				// First character changed: switch to that character's tmp file.
				prev = first
				tmpFile.close()
				tmpFile = new BufferedOutputStream(new FileOutputStream(new File(workDir, prev), true))
			}
			tmpFile.write(m.toBytes())
		}
	} finally {
		// Close even on failure — the original leaked the stream on exception.
		tmpFile.close()
	}
}

// Hex-encodes a string, two hex digits per byte (inverse of hex2str).
def str2Hex(String val) {
	// Encode as UTF-8 explicitly: hex2str decodes with UTF-8, but plain
	// getBytes() used the platform default charset.
	byte[]  bytes = val.getBytes("UTF-8")
	def result = ""
	bytes.each {
		// %02x keeps every byte two digits wide. Integer.toHexString dropped
		// the leading zero for bytes < 0x10, producing odd-length names that
		// hex2str (which consumes strict 2-digit pairs) cannot parse.
		result += String.format("%02x", it & 0xff)
	}
	return result
}
// Decodes a string of two-digit hex bytes back into UTF-8 text
// (inverse of str2Hex).
def hex2str(val) {
	byte[] buf = new byte[(val.length() / 2)]
	int i=0
	while (val.length() > 0) {
		// Parse the full two-digit pair at once; the original did per-nibble
		// Byte.parseByte arithmetic that relied on implicit lossy int -> byte
		// narrowing for values >= 0x80.
		buf[i] = (byte) Integer.parseInt(val.substring(0, 2), 16)
		val = val.substring(2)
		i++
	}
	return new String(buf, "UTF-8")
}

// Removes the work directory and its contents from a previous run.
def deleteTmpFile() {
	// listFiles() returns null when workDir does not exist yet — the very
	// first run calls deleteTmpFile() BEFORE mkdirs(), so the original
	// threw a NullPointerException here. Null-safe each fixes that.
	workDir.listFiles()?.each {
		it.delete()
	}
	workDir.delete()
}

// Loads <fileName>.idx into memory and splits it into Index records.
// Record layout (inferred from the parsing below — TODO confirm against
// the Index class): a 0-terminated word followed by 8 more bytes, i.e.
// each record spans word + 1 (NUL) + 8 trailing bytes.
def loadCompleteIndex(indexOffset = 0, indexLength = -1) {
	if (indexOffset < 0) indexOffset = 0
	RandomAccessFile randomFile = new RandomAccessFile(fileName + ".idx", "r")
	if (indexLength == -1) indexLength = randomFile.length()
	def buf = new byte[indexLength]
	randomFile.seek(indexOffset)
	randomFile.read(buf) // NOTE(review): a short read would leave trailing zeros — confirm acceptable
	randomFile.close()

	def i=0
	int offset = 0
	int count = 0
	def indexes = new ArrayList();
	while (i<buf.length) {
		if (buf[i] == 0) {
			// Record runs from `offset` through the NUL plus 8 trailing bytes (+9 total).
			def idx = new Index(buf, offset, i-offset+9)
			if (idx.word.length() > 0) indexes.add(idx)
			offset = i+9 // next record begins after the 8 trailing bytes
			i = i+8      // skip the trailing bytes (the i++ below steps past the last one)
			count++
		}
		i++
		if (i%200000 == 0) println ("loading:" + (i*100/(indexLength)) + "%")
	}
	return indexes
}