Study Notes TF019: Sequence Classification and IMDB Movie Review Classification

import tarfile
import re

from helpers import download


class ImdbMovieReviews:

    DEFAULT_URL = \
        'http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz'
    TOKEN_REGEX = re.compile(r'[A-Za-z]+|[!?.:,()]')

    def __init__(self, cache_dir, url=None):
        self._cache_dir = cache_dir
        self._url = url or type(self).DEFAULT_URL

    def __iter__(self):
        filepath = download(self._url, self._cache_dir)
        with tarfile.open(filepath) as archive:
            for filename in archive.getnames():
                if filename.startswith('aclImdb/train/pos/'):
                    yield self._read(archive, filename), True
                elif filename.startswith('aclImdb/train/neg/'):
                    yield self._read(archive, filename), False

    def _read(self, archive, filename):
        with archive.extractfile(filename) as file_:
            data = file_.read().decode('utf-8')
            data = type(self).TOKEN_REGEX.findall(data)
            data = [x.lower() for x in data]
            return data
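The download helper imported above is not listed in these notes. A minimal sketch, assuming it fetches the archive into the cache directory once and returns the local file path:

import os
import urllib.request


def download(url, cache_dir):
    # Hypothetical helper: cache the file under cache_dir and return its path.
    os.makedirs(cache_dir, exist_ok=True)
    filepath = os.path.join(cache_dir, url.split('/')[-1])
    if not os.path.isfile(filepath):
        urllib.request.urlretrieve(url, filepath)
    return filepath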
import bz2
import numpy as np
class Embedding:

    def __init__(self, vocabulary_path, embedding_path, length):
        self._embedding = np.load(embedding_path)
        with bz2.open(vocabulary_path, 'rt') as file_:
            self._vocabulary = {k.strip(): i for i, k in enumerate(file_)}
        self._length = length

    def __call__(self, sequence):
        data = np.zeros((self._length, self._embedding.shape[1]))
        indices = [self._vocabulary.get(x, 0) for x in sequence]
        embedded = self._embedding[indices]
        data[:len(sequence)] = embedded
        return data

    @property
    def dimensions(self):
        return self._embedding.shape[1]
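A quick self-contained exercise of the class with toy files (the paths, tokens, and sizes are made up for illustration): unknown tokens fall back to row 0 of the embedding matrix, and rows past the end of the sequence stay zero, which the model later uses to recover sequence lengths.

import bz2
import numpy as np

# Toy vocabulary and embedding matrix, written only for this demonstration.
with bz2.open('/tmp/vocabulary.bz2', 'wt') as file_:
    file_.write('<unk>\ngreat\nmovie\n')
np.save('/tmp/embeddings.npy', np.random.rand(3, 5))

embedding = Embedding('/tmp/vocabulary.bz2', '/tmp/embeddings.npy', length=4)
vectors = embedding(['great', 'movie', '!'])
print(vectors.shape)         # (4, 5): three embedded tokens plus one zero row
print(embedding.dimensions)  # 5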
import tensorflow as tf

from helpers import lazy_property
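lazy_property is also pulled from helpers without being listed. A minimal sketch, assuming the usual memoizing decorator used throughout these examples: the wrapped function runs once, and the resulting graph node is cached as an instance attribute.

import functools


def lazy_property(function):
    # Hypothetical helper: compute the property on first access, then cache it.
    attribute = '_lazy_' + function.__name__

    @property
    @functools.wraps(function)
    def wrapper(self):
        if not hasattr(self, attribute):
            setattr(self, attribute, function(self))
        return getattr(self, attribute)
    return wrapper

This is why the constructor below can simply touch self.prediction, self.cost, self.error, and self.optimize: each access builds its part of the graph exactly once.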
class SequenceClassificationModel:

    def __init__(self, data, target, params):
        self.data = data
        self.target = target
        self.params = params
        self.prediction
        self.cost
        self.error
        self.optimize

    @lazy_property
    def length(self):
        used = tf.sign(tf.reduce_max(tf.abs(self.data), reduction_indices=2))
        length = tf.reduce_sum(used, reduction_indices=1)
        length = tf.cast(length, tf.int32)
        return length
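The length property exploits the zero padding added by Embedding: a padding frame has an all-zero feature vector, so its mask entry is 0, and summing the mask per example recovers the true sequence length. The same arithmetic in plain NumPy (shapes and values made up):

import numpy as np

batch = np.zeros((1, 4, 3))        # one example, 4 time steps, 3 features
batch[0, :2] = [[0.5, -1.0, 0.2],  # two real frames, followed by
                [0.1, 0.0, 0.9]]   # two all-zero padding frames
used = np.sign(np.abs(batch).max(axis=2))  # [[1., 1., 0., 0.]]
print(used.sum(axis=1))                    # [2.] -> true length is 2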
    @lazy_property
    def prediction(self):
        # Recurrent network.
        output, _ = tf.nn.dynamic_rnn(
            self.params.rnn_cell(self.params.rnn_hidden),
            self.data,
            dtype=tf.float32,
            sequence_length=self.length,
        )
        last = self._last_relevant(output, self.length)
        # Softmax layer.
        num_classes = int(self.target.get_shape()[1])
        weight = tf.Variable(tf.truncated_normal(
            [self.params.rnn_hidden, num_classes], stddev=0.01))
        bias = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
        return prediction
    @lazy_property
    def cost(self):
        cross_entropy = -tf.reduce_sum(self.target * tf.log(self.prediction))
        return cross_entropy

    @lazy_property
    def error(self):
        mistakes = tf.not_equal(
            tf.argmax(self.target, 1), tf.argmax(self.prediction, 1))
        return tf.reduce_mean(tf.cast(mistakes, tf.float32))
    @lazy_property
    def optimize(self):
        gradient = self.params.optimizer.compute_gradients(self.cost)
        try:
            limit = self.params.gradient_clipping
            gradient = [
                (tf.clip_by_value(g, -limit, limit), v)
                if g is not None else (None, v)
                for g, v in gradient]
        except AttributeError:
            print('No gradient clipping parameter specified.')
        optimize = self.params.optimizer.apply_gradients(gradient)
        return optimize
    @staticmethod
    def _last_relevant(output, length):
        batch_size = tf.shape(output)[0]
        max_length = int(output.get_shape()[1])
        output_size = int(output.get_shape()[2])
        index = tf.range(0, batch_size) * max_length + (length - 1)
        flat = tf.reshape(output, [-1, output_size])
        relevant = tf.gather(flat, index)
        return relevant
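_last_relevant flattens the (batch, time, features) output so a single tf.gather can select the activation at each example's final real time step. The index arithmetic, replayed in NumPy with toy shapes:

import numpy as np

output = np.arange(24).reshape(2, 3, 4)  # batch=2, max_length=3, features=4
length = np.array([2, 3])                # true lengths per example
index = np.arange(2) * 3 + (length - 1)  # [1, 5]
flat = output.reshape(-1, 4)             # shape (6, 4)
print(flat[index])                       # rows output[0, 1] and output[1, 2]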
import tensorflow as tf

from helpers import AttrDict
from Embedding import Embedding
from ImdbMovieReviews import ImdbMovieReviews
from preprocess_batched import preprocess_batched
from SequenceClassificationModel import SequenceClassificationModel
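preprocess_batched is imported from its own module but never listed in these notes. A minimal sketch, assuming it embeds each tokenized review, one-hot encodes the boolean label (the [1, 0]/[0, 1] ordering here is an assumption), and yields fixed-size NumPy batches:

import numpy as np


def preprocess_batched(iterator, length, embedding, batch_size):
    # Hypothetical helper: turn (tokens, label) pairs into (data, target) batches.
    iterator = iter(iterator)
    while True:
        data = np.zeros((batch_size, length, embedding.dimensions))
        target = np.zeros((batch_size, 2))
        for index in range(batch_size):
            try:
                text, label = next(iterator)
            except StopIteration:
                return  # stop cleanly; a final partial batch is dropped
            data[index] = embedding(text)
            target[index] = [1, 0] if label else [0, 1]
        yield data, target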
IMDB_DOWNLOAD_DIR = './imdb'
WIKI_VOCAB_DIR = '../01_wikipedia/wikipedia'
WIKI_EMBED_DIR = '../01_wikipedia/wikipedia'

params = AttrDict(
    rnn_cell=tf.contrib.rnn.GRUCell,
    rnn_hidden=300,
    optimizer=tf.train.RMSPropOptimizer(0.002),
    batch_size=20,
)
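AttrDict also comes from helpers. A minimal sketch, assuming the common dictionary subclass that exposes its keys as attributes, so params.rnn_hidden reads params['rnn_hidden']:

class AttrDict(dict):
    # Hypothetical helper: dictionary entries readable as attributes.

    def __getattr__(self, key):
        try:
            return self[key]
        except KeyError:
            raise AttributeError(key)

Raising AttributeError for missing keys is what lets the optimize property above fall back gracefully when no gradient_clipping entry is configured.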
reviews = ImdbMovieReviews(IMDB_DOWNLOAD_DIR)
length = max(len(x[0]) for x in reviews)

embedding = Embedding(
    WIKI_VOCAB_DIR + '/vocabulary.bz2',
    WIKI_EMBED_DIR + '/embeddings.npy', length)
batches = preprocess_batched(reviews, length, embedding, params.batch_size)

data = tf.placeholder(tf.float32, [None, length, embedding.dimensions])
target = tf.placeholder(tf.float32, [None, 2])
model = SequenceClassificationModel(data, target, params)

sess = tf.Session()
sess.run(tf.global_variables_initializer())
for index, batch in enumerate(batches):
    feed = {data: batch[0], target: batch[1]}
    error, _ = sess.run([model.error, model.optimize], feed)
    print('{}: {:3.1f}%'.format(index + 1, 100 * error))
References:
TensorFlow for Machine Intelligence (《面向机器智能的TensorFlow实践》)