OpenNE Source Code Walkthrough: LINE

Network Embedding

Posted by Yunlongs on April 28, 2019


This post walks through OpenNE's implementation of LINE; the annotated source code is pasted directly below.
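Before reading the code it helps to recall the negative-sampling objective from the LINE paper (Tang et al., WWW 2015), since the code below is, term for term, an implementation of their Eq. (7) for an edge $(i, j)$:

$$\log\sigma\!\left(\vec{u}_j'^{\,T}\vec{u}_i\right)+\sum_{n=1}^{K}\mathbb{E}_{v_n\sim P_n(v)}\left[\log\sigma\!\left(-\vec{u}_n'^{\,T}\vec{u}_i\right)\right],\qquad \sigma(x)=\frac{1}{1+e^{-x}}$$

Here $K$ is negative_ratio, $\vec{u}'$ are the context embeddings, and the noise distribution is $P_n(v)\propto d_v^{3/4}$ with $d_v$ the weighted out-degree. For first-order proximity the same form is used with the vertex embedding $\vec{u}_j$ in place of $\vec{u}_j'$. Keeping Eq. (7) in mind makes the sign placeholder in build_graph and the two sampling tables in gen_sampling_table much easier to follow.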

from __future__ import print_function
import random
import math
import numpy as np
from sklearn.linear_model import LogisticRegression
import tensorflow as tf
from .classify import Classifier, read_node_label


class _LINE(object):

    def __init__(self, graph, rep_size=128, batch_size=1000, negative_ratio=5, order=3):
        self.cur_epoch = 0   # number of epochs trained so far
        self.order = order   # proximity order of this model (1 or 2)
        self.g = graph       # the input graph
        self.node_size = graph.G.number_of_nodes()  # number of nodes
        self.rep_size = rep_size    # dimensionality of the embedding vectors
        self.batch_size = batch_size    # number of edges sampled per training batch
        self.negative_ratio = negative_ratio  # number of negative samples drawn per positive edge

        self.gen_sampling_table()  # build the alias tables for weighted edge sampling and the negative-sampling table
        self.sess = tf.Session()   # TensorFlow session
        cur_seed = random.getrandbits(32)  # draw a random 32-bit integer to use as seed
        initializer = tf.contrib.layers.xavier_initializer(
            uniform=False, seed=cur_seed)  # Xavier weight initializer (normal distribution, since uniform=False)
        with tf.variable_scope("model", reuse=None, initializer=initializer):  # variable scope "model", reuse disabled
            self.build_graph()  # build the TensorFlow computation graph
        self.sess.run(tf.global_variables_initializer())  # run the initializers of all variables

    def build_graph(self):
        self.h = tf.placeholder(tf.int32, [None])
        self.t = tf.placeholder(tf.int32, [None])
        self.sign = tf.placeholder(tf.float32, [None])  # placeholders: head node indices, tail node indices, and the batch sign (+1 positive / -1 negative)

        cur_seed = random.getrandbits(32)  # draw a random 32-bit integer to use as seed
        self.embeddings = tf.get_variable(name="embeddings"+str(self.order), shape=[
                                          self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
        self.context_embeddings = tf.get_variable(name="context_embeddings"+str(self.order), shape=[
                                                  self.node_size, self.rep_size], initializer=tf.contrib.layers.xavier_initializer(uniform=False, seed=cur_seed))
        # the two variables above are (node_size x rep_size) matrices named "embeddings<order>" and "context_embeddings<order>", both Xavier-initialized

        # self.h_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.h), 1)
        # self.t_e = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.embeddings, self.t), 1)
        # self.t_e_context = tf.nn.l2_normalize(tf.nn.embedding_lookup(self.context_embeddings, self.t), 1)

        self.h_e = tf.nn.embedding_lookup(self.embeddings, self.h)  # row h of the embedding matrix, i.e. the embedding of vertex h
        self.t_e = tf.nn.embedding_lookup(self.embeddings, self.t)  # row t of the embedding matrix, i.e. the embedding of vertex t
        self.t_e_context = tf.nn.embedding_lookup(
            self.context_embeddings, self.t)    # row t of the context embedding matrix, i.e. the context embedding of vertex t

        self.second_loss = -tf.reduce_mean(tf.log_sigmoid(
            self.sign*tf.reduce_sum(tf.multiply(self.h_e, self.t_e_context), axis=1)))
        # second-order loss; the sign placeholder folds the positive and negative-sample terms of the paper's Eq. (7) into a single expression (see the note at the end of build_graph)
        self.first_loss = -tf.reduce_mean(tf.log_sigmoid(
            self.sign*tf.reduce_sum(tf.multiply(self.h_e, self.t_e), axis=1)))
        # first-order loss: same construction, but with the vertex embedding of t instead of its context embedding
        if self.order == 1:  # pick the loss matching this model's proximity order
            self.loss = self.first_loss
        else:
            self.loss = self.second_loss
        optimizer = tf.train.AdamOptimizer(0.001)  # Adam optimizer with learning rate 0.001
        self.train_op = optimizer.minimize(self.loss)  # training op: minimize the selected loss
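        # Note on the loss vs. the paper: for a positive batch (sign = +1) the term is
        # -log_sigmoid(u_h . u'_t) over observed edges; for a negative batch (sign = -1,
        # tails drawn from the degree^0.75 noise table) it is -log_sigmoid(-u_h . u'_t).
        # One positive batch plus negative_ratio negative batches therefore optimizes the
        # negated Eq. (7) of the LINE paper, so the code does match the published formula;
        # the sign placeholder merely merges both terms into one expression.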

    def train_one_epoch(self):
        sum_loss = 0.0
        batches = self.batch_iter()  # generator yielding (h, t, sign) batches
        batch_id = 0
        for batch in batches:  # iterate over the yielded batches
            h, t, sign = batch
            feed_dict = {
                self.h: h,
                self.t: t,
                self.sign: sign,
            }  # feed dict for the placeholders
            _, cur_loss = self.sess.run([self.train_op, self.loss], feed_dict)  # one optimization step
            sum_loss += cur_loss  # accumulate the batch losses
            batch_id += 1
        print('epoch:{} sum of loss:{!s}'.format(self.cur_epoch, sum_loss))
        self.cur_epoch += 1

    def batch_iter(self):
        look_up = self.g.look_up_dict  # dict mapping every node label to its integer index

        table_size = 1e8    # size of the negative-sampling table (10^8 entries)
        numNodes = self.node_size

        edges = [(look_up[x[0]], look_up[x[1]]) for x in self.g.G.edges()]  # every edge stored as a pair of node indices

        data_size = self.g.G.number_of_edges()  # number of edges
        edge_set = set([x[0]*numNodes+x[1] for x in edges])   # edges encoded as head_index * numNodes + tail_index (not used below)
        shuffle_indices = np.random.permutation(np.arange(data_size))  # random permutation of the edge indices 0 .. data_size-1

        # positive or negative mod
        mod = 0
        mod_size = 1 + self.negative_ratio  # e.g. 6 when negative_ratio=5: one positive batch followed by 5 negative batches
        h = []
        t = []
        sign = 0

        start_index = 0
        end_index = min(start_index+self.batch_size, data_size)  # end of the slice: at most batch_size edges, capped at the number of edges
        while start_index < data_size:
            if mod == 0:
                sign = 1.
                h = []
                t = []
                for i in range(start_index, end_index):
                    if not random.random() < self.edge_prob[shuffle_indices[i]]:  # alias edge sampling: reject with probability 1 - edge_prob
                        shuffle_indices[i] = self.edge_alias[shuffle_indices[i]]   # on rejection, replace the edge index with its alias edge
                    cur_h = edges[shuffle_indices[i]][0]
                    cur_t = edges[shuffle_indices[i]][1]  # head and tail node indices of the sampled edge
                    h.append(cur_h)
                    t.append(cur_t)    # collect the sampled positive edge
            else:
                sign = -1.  # negative batches: reuse the same head nodes h
                t = []
                for i in range(len(h)):
                    t.append(
                        self.sampling_table[random.randint(0, table_size-1)])  # tails drawn from the degree^0.75 noise table

            yield h, t, [sign]
            mod += 1
            mod %= mod_size  # mod cycles through 0 1 2 3 4 5 and wraps back to 0
            if mod == 0:  # one positive and negative_ratio negative batches have been yielded; move on to the next slice of edges
                start_index = end_index
                end_index = min(start_index+self.batch_size, data_size)
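        # Cycle for one slice, with negative_ratio = 5: the generator first yields a
        # positive batch (sign = +1, up to batch_size edges drawn proportionally to
        # their weight through edge_prob/edge_alias), then 5 negative batches
        # (sign = -1) that keep the same head list h but replace every tail with a
        # node drawn uniformly from sampling_table, i.e. from P_n(v) ~ degree^0.75.
        # Note that sign is yielded as a one-element list, which TensorFlow
        # broadcasts against the per-edge scores in the loss.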

    def gen_sampling_table(self):
        table_size = 1e8  # size of the vertex sampling table (10^8 entries)
        power = 0.75
        numNodes = self.node_size

        print("Pre-procesing for non-uniform negative sampling!")
        node_degree = np.zeros(numNodes)  # out degree

        look_up = self.g.look_up_dict
        for edge in self.g.G.edges():
            node_degree[look_up[edge[0]]
                        ] += self.g.G[edge[0]][edge[1]]["weight"]  # accumulate each node's weighted out-degree (equals the plain out-degree when all weights are 1)

        norm = sum([math.pow(node_degree[i], power) for i in range(numNodes)])  # sum of degree^0.75 over all nodes
        # this is the normalizer of the noise distribution Pn(v) ~ degree^0.75 from Eq. (7)


        self.sampling_table = np.zeros(int(table_size), dtype=np.uint32)  # allocate the 10^8-entry vertex sampling table

        p = 0
        i = 0
        for j in range(numNodes):
            p += float(math.pow(node_degree[j], power)) / norm
            while i < table_size and float(i) / table_size < p:
                self.sampling_table[i] = j
                i += 1
                # each node fills a contiguous share of the table proportional to degree^0.75
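        # Small example: with three nodes of weighted out-degree [1, 1, 8],
        # degree^0.75 is [1.0, 1.0, 4.76] and norm is 6.76, so p climbs to roughly
        # 0.148, 0.296 and 1.0: about 14.8% of the table entries map to node 0,
        # 14.8% to node 1 and 70.4% to node 2, which is exactly Pn(v) ~ degree^0.75.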


        data_size = self.g.G.number_of_edges()      # number of edges
        self.edge_alias = np.zeros(data_size, dtype=np.int32)
        self.edge_prob = np.zeros(data_size, dtype=np.float32)
        large_block = np.zeros(data_size, dtype=np.int32)  # indices of edges whose scaled probability is >= 1
        small_block = np.zeros(data_size, dtype=np.int32)  # indices of edges whose scaled probability is < 1

        total_sum = sum([self.g.G[edge[0]][edge[1]]["weight"]  # total weight over all edges
                         for edge in self.g.G.edges()])
        norm_prob = [self.g.G[edge[0]][edge[1]]["weight"] *
                     data_size/total_sum for edge in self.g.G.edges()]  # each edge's weight rescaled so the probabilities average to 1
        num_small_block = 0
        num_large_block = 0
        cur_small_block = 0
        cur_large_block = 0
        for k in range(data_size-1, -1, -1):  # iterate over the edges from data_size-1 down to 0
            if norm_prob[k] < 1:
                small_block[num_small_block] = k
                num_small_block += 1
            else:
                large_block[num_large_block] = k
                num_large_block += 1                   # split the edges into a "small" (< 1) and a "large" (>= 1) block
        while num_small_block and num_large_block:
            num_small_block -= 1
            cur_small_block = small_block[num_small_block]  # pop an edge from the small block
            num_large_block -= 1
            cur_large_block = large_block[num_large_block]  # pop an edge from the large block
            self.edge_prob[cur_small_block] = norm_prob[cur_small_block]  # acceptance probability of the small edge
            self.edge_alias[cur_small_block] = cur_large_block       # its alias is the large edge
            norm_prob[cur_large_block] = norm_prob[cur_large_block] + \
                norm_prob[cur_small_block] - 1  # the large edge donates the probability mass the small edge was missing
            if norm_prob[cur_large_block] < 1:
                small_block[num_small_block] = cur_large_block
                num_small_block += 1
            else:
                large_block[num_large_block] = cur_large_block
                num_large_block += 1

        while num_large_block:  # whatever is left over keeps acceptance probability 1
            num_large_block -= 1
            self.edge_prob[large_block[num_large_block]] = 1
        while num_small_block:
            num_small_block -= 1
            self.edge_prob[small_block[num_small_block]] = 1
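        # Worked example of the alias construction: three edges with weights [1, 1, 4]
        # give total_sum = 6 and norm_prob = weight * 3 / 6 = [0.5, 0.5, 2.0].  Pairing
        # the two small edges with the large edge 2 leaves edge_prob = [0.5, 0.5, 1.0]
        # and edge_alias = [2, 2, 0] (the last alias is never used since its prob is 1).
        # A uniformly drawn edge index is kept with probability edge_prob and otherwise
        # replaced by its alias, so edge 2 ends up sampled with probability
        # 1/3 + 1/3 * 0.5 + 1/3 * 0.5 = 2/3 = 4/6, exactly its share of the total weight.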

    def get_embeddings(self):
        vectors = {}
        embeddings = self.embeddings.eval(session=self.sess)  # fetch the learned embedding matrix
        # embeddings = self.sess.run(tf.nn.l2_normalize(self.embeddings.eval(session=self.sess), 1))
        look_back = self.g.look_back_list
        for i, embedding in enumerate(embeddings):
            vectors[look_back[i]] = embedding   # map each original node label back to its embedding vector
        return vectors


class LINE(object):

    def __init__(self, graph, rep_size=128, batch_size=1000, epoch=10, negative_ratio=5, order=3, label_file=None, clf_ratio=0.5, auto_save=True):
        self.rep_size = rep_size
        self.order = order
        self.best_result = 0
        self.vectors = {}
        if order == 3:   # order 3 means first- and second-order proximity are both trained and later concatenated
            self.model1 = _LINE(graph, rep_size/2, batch_size,
                                negative_ratio, order=1)
            self.model2 = _LINE(graph, rep_size/2, batch_size,
                                negative_ratio, order=2)  # one model per order, each with half of the final dimensionality (note: rep_size/2 is float division under Python 3)
            for i in range(epoch):   # train for `epoch` epochs (10 by default)
                self.model1.train_one_epoch()  # one epoch of the first-order model
                self.model2.train_one_epoch()  # one epoch of the second-order model
                if label_file:  # if a node label file is given, evaluate the embeddings after every epoch
                    self.get_embeddings()  # collect the current embedding vectors
                    X, Y = read_node_label(label_file)  # read the node labels
                    print("Training classifier using {:.2f}% nodes...".format(
                        clf_ratio*100))  # fraction of nodes used to train the classifier
                    clf = Classifier(vectors=self.vectors,
                                     clf=LogisticRegression())  # logistic-regression node classifier
                    result = clf.split_train_evaluate(X, Y, clf_ratio)  # train/test split evaluation of node classification

                    if result['macro'] > self.best_result:
                        self.best_result = result['macro']   # keep the embeddings with the best macro-F1 seen so far
                        if auto_save:
                            self.best_vector = self.vectors

        else:  # only first-order or only second-order: same training loop as above with a single model
            self.model = _LINE(graph, rep_size, batch_size,
                               negative_ratio, order=self.order)
            for i in range(epoch):
                self.model.train_one_epoch()
                if label_file:
                    self.get_embeddings()
                    X, Y = read_node_label(label_file)
                    print("Training classifier using {:.2f}% nodes...".format(
                        clf_ratio*100))
                    clf = Classifier(vectors=self.vectors,
                                     clf=LogisticRegression())
                    result = clf.split_train_evaluate(X, Y, clf_ratio)

                    if result['macro'] > self.best_result:
                        self.best_result = result['macro']
                        if auto_save:
                            self.best_vector = self.vectors

        self.get_embeddings()
        if auto_save and label_file:
            self.vectors = self.best_vector

    def get_embeddings(self):  # collect the embedding vectors from the trained model(s)
        self.last_vectors = self.vectors
        self.vectors = {}
        if self.order == 3:  # both orders were trained
            vectors1 = self.model1.get_embeddings()
            vectors2 = self.model2.get_embeddings()  # first-order and second-order embeddings
            for node in vectors1.keys():  # for every node
                self.vectors[node] = np.append(vectors1[node], vectors2[node])  # concatenate the first- and second-order vectors into one rep_size-dimensional vector
        else:
            self.vectors = self.model.get_embeddings()

    def save_embeddings(self, filename):  # write the embeddings to a file in word2vec text format
        fout = open(filename, 'w')
        node_num = len(self.vectors.keys())
        fout.write("{} {}\n".format(node_num, self.rep_size))
        for node, vec in self.vectors.items():
            fout.write("{} {}\n".format(node,
                                        ' '.join([str(x) for x in vec])))
        fout.close()
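
Finally, a minimal usage sketch. It assumes OpenNE's Graph class with its read_edgelist method and a made-up edge list file name; the exact module paths and arguments can differ between OpenNE versions, so treat this as an illustration rather than the repository's own example.

from openne.graph import Graph
from openne.line import LINE

g = Graph()
g.read_edgelist('blogCatalog.edgelist', weighted=False, directed=False)  # hypothetical input file

# order=3 trains both proximities and concatenates them into rep_size-dimensional vectors
model = LINE(g, rep_size=128, batch_size=1000, epoch=10,
             negative_ratio=5, order=3, label_file=None)
model.save_embeddings('line_embeddings.txt')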