
Learning Notes TF019: Sequence Classification and IMDB Movie Review Classification


ImdbMovieReviews downloads the Stanford IMDB review archive, walks the labeled training directories, and yields each review as a list of lowercase tokens together with its positive/negative label:

import tarfile
import re

from helpers import download


class ImdbMovieReviews:

    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    def __init__(self, cache_dir, url=None):
        self._cache_dir = cache_dir
        self._url = url or type(self).DEFAULT_URL

    def __iter__(self):
        filepath = download(self._url, self._cache_dir)
        with tarfile.open(filepath) as archive:
            for filename in archive.getnames():
                # The directory a review lives in encodes its label.
                if filename.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, filename), True
                elif filename.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, filename), False

    def _read(self, archive, filename):
        with archive.extractfile(filename) as file_:
            data = file_.read().decode('utf-8')
            data = type(self).TOKEN_REGEX.findall(data)
            data = [x.lower() for x in data]
            return data

Embedding looks every token up in a precomputed vocabulary and returns the matching rows of a precomputed embedding matrix, zero-padding each sequence to a fixed length:

import bz2

import numpy as np


class Embedding:

    def __init__(self, vocabulary_path, embedding_path, length):
        self._embedding = np.load(embedding_path)
        with bz2.open(vocabulary_path, 'rt') as file_:
            self._vocabulary = {k.strip(): i for i, k in enumerate(file_)}
        self._length = length

    def __call__(self, sequence):
        data = np.zeros((self._length, self._embedding.shape[1]))
        # Unknown tokens fall back to vocabulary index 0.
        indices = [self._vocabulary.get(x, 0) for x in sequence]
        embedded = self._embedding[indices]
        data[:len(sequence)] = embedded
        return data

    @property
    def dimensions(self):
        return self._embedding.shape[1]

SequenceClassificationModel infers the true length of each zero-padded sequence, runs a recurrent network over it, feeds the output of the last relevant time step into a softmax layer, and optionally clips gradients before applying them:

import tensorflow as tf

from helpers import lazy_property


class SequenceClassificationModel:

    def __init__(self, data, target, params):
        self.data = data
        self.target = target
        self.params = params
        self.prediction
        self.cost
        self.error
        self.optimize

    @lazy_property
    def length(self):
        # A padded time step is all zeros, so its max absolute value
        # is zero; summing the signs counts the real time steps.
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length

    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
            self.params.rnn_cell(self.params.rnn_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        last = self._last_relevant(output, self.length)
        # Softmax layer.
        num_classes = int(self.target.get_shape()[1])
        weight = tf.Variable(tf.truncated_normal(
            [self.params.rnn_hidden, num_classes], stddev=0.01))
        bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction

    @lazy_property
    def cost(self):
        cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
        return cross_entropy

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))

    @lazy_property
    def optimize(self):
        gradient = self.params.optimizer.compute_gradients(self.cost)
        try:
            limit = self.params.gradient_clipping
            gradient = [
                (tf.clip_by_value(g, -limit, limit), v)
                if g is not None else (None, v)
                for g, v in gradient]
        except AttributeError:
            print('No gradient clipping parameter specified.')
        optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize

    @staticmethod
    def _last_relevant(output, length):
        # Flatten the batch and gather one output row per sequence:
        # the row at (sequence index * max_length + length - 1).
        batch_size = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        output_size = int(output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, output_size])
        relevant = tf.gather(flat, index)
        return relevant

The training script wires everything together, reusing the vocabulary and embeddings produced by the earlier Wikipedia example, and prints the error of every batch:

import tensorflow as tf

from helpers import AttrDict
from Embedding import Embedding
from ImdbMovieReviews import ImdbMovieReviews
from preprocess_batched import preprocess_batched
from SequenceClassificationModel import SequenceClassificationModel

IMDB_DOWNLOAD_DIR = './imdb'
WIKI_VOCAB_DIR = '../01_wikipedia/wikipedia'
WIKI_EMBED_DIR = '../01_wikipedia/wikipedia'

params = AttrDict(
    rnn_cell=tf.contrib.rnn.GRUCell,
    rnn_hidden=300,
    optimizer=tf.train.RMSPropOptimizer(0.002),
    batch_size=20,
)

reviews = ImdbMovieReviews(IMDB_DOWNLOAD_DIR)
length = max(len(x[0]) for x in reviews)

embedding = Embedding(
    WIKI_VOCAB_DIR + '/vocabulary.bz2',
    WIKI_EMBED_DIR + '/embeddings.npy', length)
batches = preprocess_batched(reviews, length, embedding, params.batch_size)

data = tf.placeholder(tf.float32, [None, length, embedding.dimensions])
target = tf.placeholder(tf.float32, [None, 2])
model = SequenceClassificationModel(data, target, params)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for index, batch in enumerate(batches):
    feed = {data: batch[0], target: batch[1]}
    error, _ = sess.run([model.error, model.optimize], feed)
    print('{}: {:3.1f}%'.format(index + 1, 100 * error))
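The listings above lean on a small helpers module (lazy_property, download, AttrDict) that the imports reference but this note never shows. Here is a minimal sketch of what those helpers can look like, inferred from how they are used above rather than copied from the book:

import functools
import os
import shutil
import urllib.request


def lazy_property(function):
    # Evaluate the decorated method once and cache the result, so
    # each property adds its TensorFlow graph nodes only one time.
    attribute = '_cache_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper


def download(url, cache_dir):
    # Fetch url into cache_dir unless a cached copy already exists.
    os.makedirs(cache_dir, exist_ok=True)
    filepath = os.path.join(cache_dir, url.split('/')[-1])
    if not os.path.isfile(filepath):
        with urllib.request.urlopen(url) as response, \
                open(filepath, 'wb') as file_:
            shutil.copyfileobj(response, file_)
    return filepath


class AttrDict(dict):
    # A dict whose keys can also be read as attributes
    # (params.batch_size instead of params['batch_size']).
    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)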

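preprocess_batched is imported from its own module and likewise not defined in this note. Judging from its call site, it embeds each tokenized review, encodes the boolean label as a one-hot vector, and yields batches of (data, target) arrays; a sketch under those assumptions:

import numpy as np


def preprocess_batched(iterator, length, embedding, batch_size):
    # Turn (tokens, label) pairs into (batch, length, dimensions)
    # inputs and (batch, 2) one-hot targets.
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            try:
                text, label = next(iterator)
            except StopIteration:
                return  # drop the final partial batch
            data[index] = embedding(text)
            target[index] = [1, 0] if label else [0, 1]
        yield data, target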
 
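The index arithmetic in _last_relevant is easy to check with NumPy on made-up shapes; each gathered row is the output of a sequence's last valid time step:

import numpy as np

# A batch of 2 sequences, max length 3, output size 2.
output = np.arange(12).reshape(2, 3, 2)
length = np.array([2, 3])  # true (unpadded) sequence lengths

batch_size, max_length, output_size = output.shape
index = np.arange(batch_size) * max_length + (length - 1)
flat = output.reshape(-1, output_size)
relevant = flat[index]
# relevant[0] == output[0, 1] and relevant[1] == output[1, 2]:
# the last valid time step of each sequence.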

References:
"TensorFlow for Machine Intelligence" (《面向机器智能的TensorFlow实践》)

Feel free to add me on WeChat to discuss: qingxingfengzi
My WeChat public account: qingxingfengzigz
My wife Zhang Xingqing's WeChat public account: qingqingfeifangz


