with open(dev_query_dir, 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for i, line in enumerate(lines):
        query_list = line.strip().split('\t', 1)
        query_id, query = query_list[0], query_list[1]
        dev_query.loc[i, :] = [query_id, query]

dict_dev_query = dict(zip(dev_query['query_id'], dev_query['query']))
with open(dict_dev_query_dir, 'w', encoding='utf-8') as f:
    f.write(str(dict_dev_query))

dev_query = dev_query.set_index('query_id')
dev_query.head()
Results:

   query_id  query
0  200001    甲黄酸阿怕替尼片
1  200002    索泰zbox
2  200003    kfc游戏机
3  200004    bunny成兔粮
4  200005    铁线威灵仙
dev_query.shape
Results:
(1000, 2)
3 Text-mining
3.1 Text preprocessing
3.1.1 Word segmentation
import jieba
" ".join(jieba.cut("甲黄酸阿怕替尼片"))
Results:
'甲 黄酸 阿怕 替尼片'
def title_cut(x):
    return list(jieba.cut(x))
from joblib import Parallel, delayed
corpus_title = Parallel(n_jobs=-1)(
    delayed(title_cut)(title) for title in corpus["title"]
)
train_title = Parallel(n_jobs=-1)(
    delayed(title_cut)(title) for title in train_query["query"]
)
dev_title = Parallel(n_jobs=-1)(
    delayed(title_cut)(title) for title in dev_query["query"]
)
3.1.2 Word2Vec
from gensim.models import Word2Vec
from gensim.test.utils import common_texts
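The Word2Vec training call itself is missing from the excerpt. A minimal sketch, assuming 128-dimensional vectors (the dimension checked by the submission validator below) and training on the segmented titles and queries from 3.1.1; every parameter here is an assumption:

# Hypothetical training step: vector_size, window, min_count, sg and workers are assumed values.
model = Word2Vec(
    sentences=corpus_title + train_title + dev_title,
    vector_size=128,
    window=5,
    min_count=1,
    sg=1,
    workers=4,
)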
train_w2v_ids = [[model.wv.key_to_index[xx] for xx in x] for x in train_title]
corpus_w2v_ids = [[model.wv.key_to_index[xx] for xx in x] for x in corpus_title]
dev_w2v_ids = [[model.wv.key_to_index[xx] for xx in x] for x in dev_title]
3.1.3 IDF
from sklearn.feature_extraction.text import TfidfVectorizer
drop_token_ids = [model.wv.key_to_index[x] for x in drop_token]
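drop_token itself is never constructed in the excerpt. One plausible construction from the IDF values, reusing the jieba-segmented titles; the identity tokenizer/preprocessor trick, the cutoff of 20 tokens, and the idf dictionary name are all assumptions:

import numpy as np

# Fit TF-IDF on the pre-tokenized titles to get per-token IDF values.
tfidf = TfidfVectorizer(tokenizer=lambda x: x, preprocessor=lambda x: x,
                        token_pattern=None, lowercase=False)
tfidf.fit(corpus_title)
idf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

# Assumed rule: treat the 20 lowest-IDF (most frequent) tokens as stop words,
# keeping only those that exist in the Word2Vec vocabulary.
drop_token = [w for w, _ in sorted(idf.items(), key=lambda kv: kv[1])[:20]
              if w in model.wv.key_to_index]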
3.2 Unsupervised Word2Vec
Construct the embeddings directly from the word vectors.
import numpy as np

def unsuper_w2c_encoding(s, pooling="max"):
    # Drop stop-word ids, then pool the word vectors of the remaining tokens.
    corpus_query_word = [x for x in s if x not in drop_token_ids]
    if len(corpus_query_word) == 0:
        return np.zeros(128)
    feat = model.wv[corpus_query_word]
    if pooling == "max":
        return np.array(feat).max(0)
    if pooling == "avg":
        return np.array(feat).mean(0)
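How the pooled features written out below might be produced; mean pooling is assumed here only because the variables are named dev_mean_feat / corpus_mean_feat:

# Assumed pooling step over the id sequences built in 3.1.2.
dev_mean_feat = np.vstack([unsuper_w2c_encoding(s, pooling="avg") for s in dev_w2v_ids])
corpus_mean_feat = np.vstack([unsuper_w2c_encoding(s, pooling="avg") for s in corpus_w2v_ids])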
with open(dir_query_embedding, 'w') as up:
    for id, feat in zip(dev_query.index, dev_mean_feat):
        up.write('{0}\t{1}\n'.format(id, ','.join([str(x)[:6] for x in feat])))
with open(dir_doc_embedding, 'w') as up:
    for id, feat in zip(corpus.index, corpus_mean_feat):
        up.write('{0}\t{1}\n'.format(id, ','.join([str(x)[:6] for x in feat])))
if os.path.exists(dir_train_neg_piar):
    with open(dir_train_neg_piar, 'r', encoding='utf-8') as f:
        train_neg_piar = eval(f.read())
else:
    from tqdm import tqdm_notebook
    train_neg_piar = []
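The mining fragments below rely on inverse_keyword_map, idx_top1_word and MAX_NEG_SAMPLES, none of which are defined in the excerpt. A guess at the elided setup (only those three names come from the source; everything else, including the value 10, is an assumption). idx_top1_word is presumably a salient token of the current query, e.g. its highest-IDF word:

MAX_NEG_SAMPLES = 10  # assumed value

# Inverted index from token to 0-based corpus positions, used to mine hard
# negatives that share a keyword with the query.
inverse_keyword_map = {}
for pos, words in enumerate(corpus_title):
    for w in set(words):
        inverse_keyword_map.setdefault(w, []).append(pos)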
if idx_top1_word in inverse_keyword_map:
    negative_idx = inverse_keyword_map[idx_top1_word][:MAX_NEG_SAMPLES]
else:
    negative_idx = np.random.randint(corpus.shape[0], size=MAX_NEG_SAMPLES)
""" idx_keyword = [] if len(idx_top1_word) >= 2 and idx_top1_word in inverse_keyword_map: idx_keyword += inverse_keyword_map[idx_top1_word] if len(idx_start_word) >= 2 and idx_start_word in inverse_keyword_map: idx_keyword += inverse_keyword_map[idx_start_word] if len(idx_end_word) >= 2 and idx_end_word in inverse_keyword_map: idx_keyword += inverse_keyword_map[idx_end_word] negative_idx = sum(negative_idx, []) """
# negative_idx = list(set(negative_idx))
negative_idx = [x + 1 for x in negative_idx]
positive_idx = qrels_train.loc[idx].ravel()[0]
if positive_idx in negative_idx:
    negative_idx.remove(positive_idx)
train_neg_piar.append(negative_idx)

with open(dir_train_neg_piar, 'w', encoding='utf-8') as f:
    f.write(str(train_neg_piar))
eval_s1, eval_s2, eval_socre = [], [], []
for idx in tqdm_notebook(range(train_query.shape[0] - 1000, train_query.shape[0] + 1)):
    eval_s1.append(train_query.loc[idx]["query"])
    eval_s2.append(corpus.loc[qrels_train.loc[idx].ravel()[0]]["title"])
    eval_socre += [1]

    # Hard negatives mined above, if available for this query.
    if idx - 1 < len(train_neg_piar):
        for neg_idx in train_neg_piar[idx - 1]:
            eval_s1.append(train_query.loc[idx]["query"])
            eval_s2.append(corpus.loc[neg_idx]["title"])
            eval_socre += [0]
    # Plus a handful of random negatives.
    rand_idx = np.random.randint(corpus.shape[0], size=10)
    for neg_idx in rand_idx:
        eval_s1 += [train_query.loc[idx]["query"]]
        eval_s2 += [corpus.loc[neg_idx]["title"]]
        eval_socre += [0]
from torch.utils.data import DataLoader
from sentence_transformers import InputExample, SentenceTransformer, losses
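The model, the training pairs and the evaluator used below are not constructed in the excerpt. One plausible setup, all of it an assumption: a Chinese BERT backbone projected to 128 dimensions to match the submission format, positive query/title pairs from qrels_train (the last 1000 queries are held out, consistent with the evaluation loop above), and a similarity evaluator over eval_s1 / eval_s2 / eval_socre:

from sentence_transformers import models, evaluation

# Assumed backbone: pooled BERT output projected to 128 dims.
word_emb = models.Transformer('bert-base-chinese')
pooling = models.Pooling(word_emb.get_word_embedding_dimension())
dense = models.Dense(in_features=pooling.get_sentence_embedding_dimension(), out_features=128)
model = SentenceTransformer(modules=[word_emb, pooling, dense])

# Assumed positive (query, clicked-title) pairs; mined negatives could be
# appended in the same way with label=0.0.
train_examples = [
    InputExample(texts=[train_query.loc[idx]["query"],
                        corpus.loc[qrels_train.loc[idx].ravel()[0]]["title"]], label=1.0)
    for idx in range(1, train_query.shape[0] - 1000)
]
evaluator = evaluation.EmbeddingSimilarityEvaluator(eval_s1, eval_s2, eval_socre)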
# Define your train dataset, the dataloader and the train loss
train_size = len(train_examples)  # 10000
train_dataloader = DataLoader(train_examples[:train_size], shuffle=True, batch_size=10)
train_loss = losses.CosineSimilarityLoss(model)
# Tune the model
model.fit(
    train_objectives=[(train_dataloader, train_loss)],
    epochs=2,
    warmup_steps=100,
    evaluator=evaluator,
    evaluation_steps=1000,
    show_progress_bar=True,
    output_path="./sentence-bert/",
    checkpoint_save_steps=10000,
    save_best_model=True,
    checkpoint_path='./sentence-bert/'
)
query_sentences = list(dev_query["query"])[:query_len]
corpus_sentences = list(corpus["title"].iloc[:])[:corpus_len]
# corpus_sentences = [x for x in corpus_sentences if len(x) > 10]
from sklearn.preprocessing import normalize

test_size = len(corpus_sentences)
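The encoding step that produces query_embeddings and corpus_embeddings is not shown. A sketch; the batch size and the L2 normalisation are assumptions:

query_embeddings = model.encode(query_sentences, batch_size=64, show_progress_bar=True)
corpus_embeddings = model.encode(corpus_sentences, batch_size=64, show_progress_bar=True)

# Assumed: L2-normalise so that dot product equals cosine similarity downstream.
query_embeddings = normalize(query_embeddings)
corpus_embeddings = normalize(corpus_embeddings)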
with open(dir_query_embedding, 'w') as up:
    for id, feat in zip(dev_query.index, query_embeddings):
        up.write('{0}\t{1}\n'.format(id, ','.join([str(x)[:4] for x in feat])))
with open(dir_doc_embedding, 'w') as up:
    for id, feat in zip(corpus.index, corpus_embeddings):
        up.write('{0}\t{1}\n'.format(id, ','.join([str(x)[:4] for x in feat])))
import math

def is_number(s):
    if s != s.strip():
        return False
    try:
        f = float(s)
        if math.isnan(f) or math.isinf(f):
            return False
        return True
    except ValueError:
        return False
# def data_check(file, file_type="doc"):
"""
Check that each file is UTF-8 without BOM, that doc_embedding indices start at 1,
that query_embedding indices start at 200001, and that every embedding has 128 dimensions.
"""
erro_count = []
error_embeding = []
single_error_embedding = []
for file, file_type in zip(['query_embedding', 'doc_embedding'], ['query', 'doc']):
    # file, file_type = "query_embedding", "query"
    # file, file_type = "doc_embedding", "doc"
    count = 1
    id_set = set()
    with open(file) as f:
        for line in f:
            sp_line = line.strip('\n').split("\t")
            if len(sp_line) != 2:
                print("[Error] Please check your line. The line should have two parts, i.e. index \t embedding")
                print("line number: ", count)
            index, embedding = sp_line

            if not is_number(index):
                print("[Error] Please check your id. The id should be an int without other chars")
                print("line number: ", count)
            id_set.add(int(index))

            embedding_list = embedding.split(',')
            if len(embedding_list) != 128:
                print("[Error] Please check the dimension of the embedding. The dimension is not 128")
                print("line number: ", count)

            for i, emb in enumerate(embedding_list):
                if not is_number(emb):
                    print("[Error] Please check your embedding. Each value should be a float without other chars")
                    print("line number: ", count)
                    erro_count.append([index, i])
                    error_embeding.append(embedding_list)
                    single_error_embedding.append(emb)

            count += 1

    if file_type == "doc":
        # 1001501
        for i in range(1, test_size + 1):
            if i not in id_set:
                print("[Error] The index[{}] of doc_embedding is not found. Please check it.".format(i))
    elif file_type == "query":
        for i in range(200001, 201001):
            if i not in id_set:
                print("[Error] The index[{}] of query_embedding is not found. Please check it.".format(i))