# Examples

## Run models on MovieLens 1M in Google Colab

YoutubeDNN: [![Run YoutubeDNN In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_YoutubeDNN.ipynb)

SDM: [![Run SDM In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_SDM.ipynb)

DSSM InBatchSoftmax: [![Run DSSM InBatchSoftmax In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/shenweichen/DeepMatch/blob/master/examples/colab_MovieLen1M_DSSM_InBatchSoftmax.ipynb)

## YoutubeDNN/MIND with sampled softmax

The MovieLens data has been used for personalized tag recommendation, which contains 668,953 tag applications of users on movies. Here is a small fraction of the data, including only sparse fields.

![](../pics/movielens_sample.png)

This example shows how to use ``YoutubeDNN`` to solve a matching task. You can get the demo data [movielens_sample.txt](https://github.com/shenweichen/DeepMatch/tree/master/examples/movielens_sample.txt) and run the following code.

```python
from collections import Counter

import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    # Keep only the first genre of each movie so `genres` is a single-valued sparse field.
    data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values))
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", "genres"]
    SEQ_LEN = 50

    # 1.Label Encoding for sparse features,and process sequence features with `gen_data_set` and `gen_model_input`

    feature_max_idx = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        # Shift encoded ids by 1 so that every encoded id is >= 1.
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

    item_profile = data[["movie_id"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set(data, SEQ_LEN, 0)

    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 16

    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                            VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len')
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

    # Per-item occurrence counts in the training input, indexed by encoded movie id.
    train_counter = Counter(train_model_input['movie_id'])
    item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]
    sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count)

    # 3.Define Model and train

    # Compare the major version numerically: comparing version strings
    # lexicographically would misorder e.g. '10.0.0' and '2.0.0'.
    if int(tf.__version__.split('.')[0]) >= 2:
        tf.compat.v1.disable_eager_execution()
    else:
        K.set_learning_phase(True)

    model = YoutubeDNN(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, embedding_dim),
                       sampler_config=sampler_config)
    # model = MIND(user_feature_columns, item_feature_columns, dynamic_k=False, k_max=2,
    #              user_dnn_hidden_units=(64, embedding_dim), sampler_config=sampler_config)

    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    history = model.fit(train_model_input, train_label,
                        batch_size=256, epochs=1, verbose=1, validation_split=0.0, )

    # 4. Generate user features for testing and full item features for retrieval
    test_user_model_input = test_model_input
    all_item_model_input = {"movie_id": item_profile['movie_id'].values}

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
    # user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

    print(user_embs.shape)
    print(item_embs.shape)

    # 5. [Optional] ANN search by faiss and evaluate the result

    # test_true_label = {line[0]:[line[1]] for line in test_set}
    #
    # import numpy as np
    # import faiss
    # from tqdm import tqdm
    # from deepmatch.utils import recall_N
    #
    # index = faiss.IndexFlatIP(embedding_dim)
    # # faiss.normalize_L2(item_embs)
    # index.add(item_embs)
    # # faiss.normalize_L2(user_embs)
    # D, I = index.search(np.ascontiguousarray(user_embs), 50)
    # s = []
    # hit = 0
    # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    #     try:
    #         pred = [item_profile['movie_id'].values[x] for x in I[i]]
    #         filter_item = None
    #         recall_score = recall_N(test_true_label[uid], pred, N=50)
    #         s.append(recall_score)
    #         if test_true_label[uid] in pred:
    #             hit += 1
    #     except:
    #         print(i)
    # print("recall", np.mean(s))
    # print("hr", hit / len(test_user_model_input['user_id']))
```

## SDM with sampled softmax

The MovieLens data has been used for personalized tag recommendation, which contains 668,953 tag applications of users on movies. Here is a small fraction of the data, including only sparse fields.

![](../pics/movielens_sample.png)

This example shows how to use ``SDM`` to solve a matching task. You can get the demo data [movielens_sample.txt](https://github.com/shenweichen/DeepMatch/tree/master/examples/movielens_sample.txt) and run the following code.

```python
from collections import Counter

import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from deepmatch.models import SDM
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
from preprocess import gen_data_set_sdm, gen_model_input_sdm
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

if __name__ == "__main__":
    data = pd.read_csv("./movielens_sample.txt")
    # Keep only the first genre of each movie so `genres` is a single-valued sparse field.
    data['genres'] = list(map(lambda x: x.split('|')[0], data['genres'].values))
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", "genres"]
    SEQ_LEN_short = 5
    SEQ_LEN_prefer = 50

    # 1.Label Encoding for sparse features,and process sequence features with `gen_data_set_sdm` and `gen_model_input_sdm`

    feature_max_idx = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        # Shift encoded ids by 1 so that every encoded id is >= 1.
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

    item_profile = data[["movie_id"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set_sdm(data, seq_short_max_len=SEQ_LEN_short, seq_prefer_max_len=SEQ_LEN_prefer)

    train_model_input, train_label = gen_model_input_sdm(train_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)
    test_model_input, test_label = gen_model_input_sdm(test_set, user_profile, SEQ_LEN_short, SEQ_LEN_prefer)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 32
    # for sdm,we must provide `VarLenSparseFeat` with name "prefer_xxx" and "short_xxx" and their length
    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                            SparseFeat("gender", feature_max_idx['gender'], 16),
                            SparseFeat("age", feature_max_idx['age'], 16),
                            SparseFeat("occupation", feature_max_idx['occupation'], 16),
                            SparseFeat("zip", feature_max_idx['zip'], 16),
                            VarLenSparseFeat(SparseFeat('short_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN_short, 'mean',
                                             'short_sess_length'),
                            VarLenSparseFeat(SparseFeat('prefer_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN_prefer, 'mean',
                                             'prefer_sess_length'),
                            VarLenSparseFeat(SparseFeat('short_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN_short, 'mean',
                                             'short_sess_length'),
                            VarLenSparseFeat(SparseFeat('prefer_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN_prefer, 'mean',
                                             'prefer_sess_length'),
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)]

    # Per-item occurrence counts in the training input, indexed by encoded movie id.
    train_counter = Counter(train_model_input['movie_id'])
    item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]
    sampler_config = NegativeSampler('frequency', num_sampled=5, item_name='movie_id', item_count=item_count)

    # 3.Define Model and train

    # The learning phase is switched on for training regardless of the TensorFlow version.
    K.set_learning_phase(True)

    # Compare the major version numerically: comparing version strings
    # lexicographically would misorder e.g. '10.0.0' and '2.0.0'.
    if int(tf.__version__.split('.')[0]) >= 2:
        tf.compat.v1.disable_eager_execution()

    # units must be equal to item embedding dim!
    model = SDM(user_feature_columns, item_feature_columns, history_feature_list=['movie_id', 'genres'],
                units=embedding_dim, sampler_config=sampler_config)

    model.compile(optimizer='adam', loss=sampledsoftmaxloss)

    history = model.fit(train_model_input, train_label,
                        batch_size=512, epochs=1, verbose=1, validation_split=0.0, )

    # Switch the learning phase off again before extracting embeddings.
    K.set_learning_phase(False)

    # 4. Generate user features for testing and full item features for retrieval
    test_user_model_input = test_model_input
    all_item_model_input = {"movie_id": item_profile['movie_id'].values, }

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
    # user_embs = user_embs[:, i, :]  # i in [0,k_max) if MIND
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

    print(user_embs.shape)
    print(item_embs.shape)

    # 5. [Optional] ANN search by faiss and evaluate the result

    # test_true_label = {line[0]: [line[1]] for line in test_set}
    #
    # import numpy as np
    # import faiss
    # from tqdm import tqdm
    # from deepmatch.utils import recall_N
    #
    # index = faiss.IndexFlatIP(embedding_dim)
    # # faiss.normalize_L2(item_embs)
    # index.add(item_embs)
    # # faiss.normalize_L2(user_embs)
    # D, I = index.search(np.ascontiguousarray(user_embs), 50)
    # s = []
    # hit = 0
    # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    #     try:
    #         pred = [item_profile['movie_id'].values[x] for x in I[i]]
    #         filter_item = None
    #         recall_score = recall_N(test_true_label[uid], pred, N=50)
    #         s.append(recall_score)
    #         if test_true_label[uid] in pred:
    #             hit += 1
    #     except:
    #         print(i)
    # print("")
    # print("recall", np.mean(s))
    # print("hit rate", hit / len(test_user_model_input['user_id']))
```

## DSSM with in batch softmax

The MovieLens data has been used for personalized tag recommendation, which contains 668,953 tag applications of users on movies. Here is a small fraction of the data, including only sparse fields.

![](../pics/movielens_sample.png)

This example shows how to use ``DSSM`` to solve a matching task. You can get the demo data [movielens_sample.txt](https://github.com/shenweichen/DeepMatch/tree/master/examples/movielens_sample.txt) and run the following code.

```python
from collections import Counter

import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from deepmatch.models import *
from deepmatch.utils import sampledsoftmaxloss, NegativeSampler
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", "genres"]
    SEQ_LEN = 50
    negsample = 10

    # 1.Label Encoding for sparse features,and process sequence features with `gen_data_set` and `gen_model_input`

    feature_max_idx = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        # Shift encoded ids by 1 so that every encoded id is >= 1.
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

    item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)

    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 32

    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
                            SparseFeat("gender", feature_max_idx['gender'], embedding_dim),
                            SparseFeat("age", feature_max_idx['age'], embedding_dim),
                            SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
                            SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                            VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),
                            SparseFeat('genres', feature_max_idx['genres'], embedding_dim)
                            ]

    # Per-item occurrence counts in the training input, indexed by encoded movie id.
    train_counter = Counter(train_model_input['movie_id'])
    item_count = [train_counter.get(i, 0) for i in range(item_feature_columns[0].vocabulary_size)]
    sampler_config = NegativeSampler('inbatch', num_sampled=5, item_name='movie_id', item_count=item_count)

    # 3.Define Model and train

    # Compare the major version numerically: comparing version strings
    # lexicographically would misorder e.g. '10.0.0' and '2.0.0'.
    if int(tf.__version__.split('.')[0]) >= 2:
        tf.compat.v1.disable_eager_execution()
    else:
        K.set_learning_phase(True)

    model = DSSM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config)
    # model = FM(user_feature_columns, item_feature_columns, loss_type="softmax", sampler_config=sampler_config)

    model.compile(optimizer='adagrad', loss=sampledsoftmaxloss)

    history = model.fit(train_model_input, train_label,
                        batch_size=256, epochs=1, verbose=1, validation_split=0.0, )

    # 4. Generate user features for testing and full item features for retrieval
    test_user_model_input = test_model_input
    all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values}

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

    print(user_embs.shape)
    print(item_embs.shape)

    # 5. [Optional] ANN search by faiss and evaluate the result

    # test_true_label = {line[0]:[line[1]] for line in test_set}
    #
    # import numpy as np
    # import faiss
    # from tqdm import tqdm
    # from deepmatch.utils import recall_N
    #
    # index = faiss.IndexFlatIP(embedding_dim)
    # # faiss.normalize_L2(item_embs)
    # index.add(item_embs)
    # # faiss.normalize_L2(user_embs)
    # D, I = index.search(user_embs, 50)
    # s = []
    # hit = 0
    # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    #     try:
    #         pred = [item_profile['movie_id'].values[x] for x in I[i]]
    #         filter_item = None
    #         recall_score = recall_N(test_true_label[uid], pred, N=50)
    #         s.append(recall_score)
    #         if test_true_label[uid] in pred:
    #             hit += 1
    #     except:
    #         print(i)
    # print("recall", np.mean(s))
    # print("hr", hit / len(test_user_model_input['user_id']))
```

## DSSM with negative sampling

The MovieLens data has been used for personalized tag recommendation, which contains 668,953 tag applications of users on movies. Here is a small fraction of the data, including only sparse fields.

![](../pics/movielens_sample.png)

This example shows how to use ``DSSM`` to solve a matching task. You can get the demo data [movielens_sample.txt](https://github.com/shenweichen/DeepMatch/tree/master/examples/movielens_sample.txt) and run the following code.

```python
import pandas as pd
from deepctr.feature_column import SparseFeat, VarLenSparseFeat
from deepmatch.models import *
from preprocess import gen_data_set, gen_model_input
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.models import Model

if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip", "genres"]
    SEQ_LEN = 50
    negsample = 10

    # 1.Label Encoding for sparse features,and process sequence features with `gen_data_set` and `gen_model_input`

    feature_max_idx = {}
    for feature in sparse_features:
        lbe = LabelEncoder()
        # Shift encoded ids by 1 so that every encoded id is >= 1.
        data[feature] = lbe.fit_transform(data[feature]) + 1
        feature_max_idx[feature] = data[feature].max() + 1

    user_profile = data[["user_id", "gender", "age", "occupation", "zip"]].drop_duplicates('user_id')

    item_profile = data[["movie_id", "genres"]].drop_duplicates('movie_id')

    user_profile.set_index("user_id", inplace=True)

    train_set, test_set = gen_data_set(data, SEQ_LEN, negsample)

    train_model_input, train_label = gen_model_input(train_set, user_profile, SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile, SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 32

    user_feature_columns = [SparseFeat('user_id', feature_max_idx['user_id'], 16),
                            SparseFeat("gender", feature_max_idx['gender'], 16),
                            SparseFeat("age", feature_max_idx['age'], 16),
                            SparseFeat("occupation", feature_max_idx['occupation'], 16),
                            SparseFeat("zip", feature_max_idx['zip'], 16),
                            VarLenSparseFeat(SparseFeat('hist_movie_id', feature_max_idx['movie_id'], embedding_dim,
                                                        embedding_name="movie_id"), SEQ_LEN, 'mean', 'hist_len'),
                            VarLenSparseFeat(SparseFeat('hist_genres', feature_max_idx['genres'], embedding_dim,
                                                        embedding_name="genres"), SEQ_LEN, 'mean', 'hist_len'),
                            ]

    item_feature_columns = [SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim),
                            SparseFeat('genres', feature_max_idx['genres'], embedding_dim)
                            ]

    # 3.Define Model and train

    # Compare the major version numerically: comparing version strings
    # lexicographically would misorder e.g. '10.0.0' and '2.0.0'.
    if int(tf.__version__.split('.')[0]) >= 2:
        tf.compat.v1.disable_eager_execution()
    else:
        K.set_learning_phase(True)

    model = DSSM(user_feature_columns, item_feature_columns, loss_type="logistic")
    # model = FM(user_feature_columns,item_feature_columns)

    model.compile(optimizer='adagrad', loss="binary_crossentropy")

    history = model.fit(train_model_input, train_label,
                        batch_size=256, epochs=1, verbose=1, validation_split=0.0, )

    # 4. Generate user features for testing and full item features for retrieval
    test_user_model_input = test_model_input
    all_item_model_input = {"movie_id": item_profile['movie_id'].values, "genres": item_profile['genres'].values}

    user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
    item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

    user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
    item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)

    print(user_embs.shape)
    print(item_embs.shape)

    # 5. [Optional] ANN search by faiss and evaluate the result

    # test_true_label = {line[0]:[line[1]] for line in test_set}
    #
    # import numpy as np
    # import faiss
    # from tqdm import tqdm
    # from deepmatch.utils import recall_N
    #
    # index = faiss.IndexFlatIP(embedding_dim)
    # # faiss.normalize_L2(item_embs)
    # index.add(item_embs)
    # # faiss.normalize_L2(user_embs)
    # D, I = index.search(user_embs, 50)
    # s = []
    # hit = 0
    # for i, uid in tqdm(enumerate(test_user_model_input['user_id'])):
    #     try:
    #         pred = [item_profile['movie_id'].values[x] for x in I[i]]
    #         filter_item = None
    #         recall_score = recall_N(test_true_label[uid], pred, N=50)
    #         s.append(recall_score)
    #         if test_true_label[uid] in pred:
    #             hit += 1
    #     except:
    #         print(i)
    # print("recall", np.mean(s))
    # print("hr", hit / len(test_user_model_input['user_id']))
```