您可以加快计算速度:先建立一个索引,然后只对含有共同单词的组合运行 get_ngram_overlap():
import numpy as np
from itertools import combinations
# Every corpus entry is a frozenset so that it is hashable and can be used
# as a dictionary key later on.
corpus = [
    frozenset(words)
    for words in (
        ('example', 'bigram'),
        ('another', 'example'),
        ('some', 'outlier'),
    )
]
def get_ngram_overlap(ngrams_s1, ngrams_s2):
    """Return the overlap coefficient of two n-gram sets.

    The result is the number of n-grams the two sets share divided by the
    size of the smaller set.

    Args:
        ngrams_s1: First collection of n-grams; must support ``len()`` and
            ``&`` (for example a ``frozenset``).
        ngrams_s2: Second collection of n-grams, same requirements.

    Returns:
        The overlap as a float, or 0.0 when either collection is empty.
    """
    smaller = min(len(ngrams_s1), len(ngrams_s2))
    if not smaller:
        # An empty set shares nothing; returning early also avoids a
        # division by zero. 0.0 keeps the return type a float on all paths.
        return 0.0
    return len(ngrams_s1 & ngrams_s2) / smaller
# Build two lookup tables from the corpus:
#   index: word   -> list of the bigrams (frozensets) that contain the word
#   nums:  bigram -> list of the positions in ``corpus`` holding that bigram
index, nums = {}, {}
for i, b in enumerate(corpus):
    # Record the position once per corpus entry (not once per word), so
    # nums never holds the same index twice and the fill loop below does
    # not repeat identical writes.
    nums.setdefault(b, []).append(i)
    for word in b:
        index.setdefault(word, []).append(b)

# Only bigrams that share a word are ever listed together under one index
# key, so get_ngram_overlap() is called for those pairs alone; every other
# pair keeps the matrix default of 0.
index2 = {}
for v in index.values():
    for c in combinations(v, 2):
        index2[c] = get_ngram_overlap(*c)

sim_matrix = np.zeros((len(corpus), len(corpus)))
for (a, b), overlap in index2.items():
    for x in nums[a]:
        for y in nums[b]:
            # Order the two positions so the value always lands in the
            # upper triangle of the matrix.
            sim_matrix[(x, y) if x < y else (y, x)] = overlap
np.fill_diagonal(sim_matrix, 1)
print(sim_matrix)
打印:
[[1.  0.5 0. ]
 [0.  1.  0. ]
 [0.  0.  1. ]]
基准测试:使用 10,000 个二元语法(bigram),单词取自 500 个不同的单词:
import random
import numpy as np
from timeit import timeit
from itertools import combinations
# Fixed seed so that every run generates the same corpus.
random.seed(123)
# 10,000 entries; each is the set of two words drawn in turn from
# word1..word500 (a one-element set when both draws give the same word).
corpus = [
    frozenset(f'word{random.randint(1, 500)}' for _ in range(2))
    for _ in range(10_000)
]
def get_ngram_overlap(ngrams_s1, ngrams_s2):
    """Compute how much two n-gram sets overlap.

    The score is ``len(ngrams_s1 & ngrams_s2)`` divided by the length of
    the smaller of the two sets.

    Args:
        ngrams_s1: First n-gram collection; must support ``len()`` and ``&``
            (for example a ``frozenset``).
        ngrams_s2: Second n-gram collection, same requirements.

    Returns:
        The overlap score as a float; 0.0 if either collection is empty.
    """
    mn = min(len(ngrams_s1), len(ngrams_s2))
    if mn == 0:
        # Nothing can be shared with an empty set, and dividing by mn would
        # fail. Return a float so the type matches the non-empty case.
        return 0.0
    common_ngrams = ngrams_s1 & ngrams_s2
    return len(common_ngrams) / mn
def fn1():
    """Build the similarity matrix by brute force (the benchmark baseline).

    Calls get_ngram_overlap() for every pair of positions ``i < j`` in the
    module-level ``corpus`` and stores the result in the upper triangle.

    Returns:
        A ``len(corpus) x len(corpus)`` NumPy array with ones on the
        diagonal, the pairwise overlaps above it and zeros below it.
    """
    n = len(corpus)
    sim_matrix = np.zeros((n, n))
    for i in range(n):
        # Start at i + 1: only the upper triangle is filled, and the
        # diagonal is set in one go afterwards.
        for j in range(i + 1, n):
            sim_matrix[i, j] = get_ngram_overlap(corpus[i], corpus[j])
    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix
def fn2():
    """Build the similarity matrix using a word index.

    Instead of comparing every pair of corpus entries, the module-level
    ``corpus`` is first indexed by word, and get_ngram_overlap() is called
    only for entries listed under the same word. All other pairs keep the
    default value 0.

    Returns:
        A ``len(corpus) x len(corpus)`` NumPy array with ones on the
        diagonal, the pairwise overlaps above it and zeros below it.
    """
    # index: word   -> list of the bigrams (frozensets) containing the word
    # nums:  bigram -> list of the positions in corpus holding that bigram
    index, nums = {}, {}
    for i, b in enumerate(corpus):
        # Record the position once per corpus entry (not once per word), so
        # nums never holds the same index twice and the fill loop below
        # does not repeat identical writes.
        nums.setdefault(b, []).append(i)
        for word in b:
            index.setdefault(word, []).append(b)

    # Overlap for every pair of bigrams listed under a common word.
    index2 = {}
    for v in index.values():
        for c in combinations(v, 2):
            index2[c] = get_ngram_overlap(*c)

    sim_matrix = np.zeros((len(corpus), len(corpus)))
    for (a, b), overlap in index2.items():
        for x in nums[a]:
            for y in nums[b]:
                # Order the two positions so the value always lands in the
                # upper triangle of the matrix.
                sim_matrix[(x, y) if x < y else (y, x)] = overlap
    np.fill_diagonal(sim_matrix, 1)
    return sim_matrix
# Sanity check: the indexed version must reproduce the brute-force matrix.
assert np.array_equal(fn1(), fn2())

# Time one run of each implementation (brute force first) and print the
# two timings on separate lines.
t1, t2 = (timeit(fn, number=1) for fn in (fn1, fn2))
print(t1, t2, sep='\n')
在我的机器上(Python 3.10 / AMD Ryzen 5700X)打印:
19.253960204077885
0.37714757514186203