-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtmp.py
55 lines (44 loc) · 1.66 KB
/
tmp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import pandas as pd
import numpy as np
# Scratch check of the array stored in tmp/embeddings_0.npy: compute a
# mean/std pair per column, then summarise how those statistics are
# distributed.
#
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code -- only point this at a trusted file.
DATA = {
    'ad_id_emb': np.load('tmp/embeddings_0.npy', allow_pickle=True),
}
arr = DATA['ad_id_emb']

# One [mean, std] pair per index along the last axis. arr[:, col] slices the
# second axis, so this presumably expects a 2-D array -- TODO confirm.
result = [
    [np.mean(arr[:, col]), np.std(arr[:, col])]
    for col in range(arr.shape[-1])
]
dfi = pd.DataFrame(result, columns=['mean', 'std'])

# describe() summarises the 'mean' and 'std' columns; .T transposes the
# summary so each of those two columns is printed as one row.
summary = dfi.describe().T
print(summary)
# from gensim.models import Word2Vec
# from gensim.models.callbacks import CallbackAny2Vec
# class LossCallback(CallbackAny2Vec):
# '''Callback to print loss after each epoch.'''
# def __init__(self):
# self.epoch = 0
# self.loss_to_be_subed = 0
# def on_epoch_end(self, model):
# loss = model.get_latest_training_loss()
# loss_now = loss - self.loss_to_be_subed
# self.loss_to_be_subed = loss
# print('Loss after epoch {}: {}'.format(self.epoch, loss_now))
# self.epoch += 1
# model = Word2Vec(common_texts, size=100, window=5, min_count=1,
# compute_loss=True, callbacks=[LossCallback()])
# tmp = df.groupby(sentence_id,
# as_index=False)[word_id].agg({list_col_nm: list})
# sentences = tmp[list_col_nm].values.tolist()
# all_words_vocabulary = df[word_id].unique().tolist()
# del tmp[list_col_nm]
# gc.collect()
# if embedding_type == 'w2v':
# model = Word2Vec(
# sentences,
# size=emb_size,
# window=150,
# workers=n_jobs,
# min_count=1, # minimum word frequency; min_count > 1 leads to OOV (out-of-vocabulary) words
# sg=sg, # 1 for skip-gram; otherwise CBOW.
# hs=hs, # If 1, hierarchical softmax will be used for model training
# negative=negative, # hs=1 + negative: negative sampling
# iter=epoch,
# seed=0)