tensorflow 使用資料集(tf.data)的方法對資料集進行操縱。
1. 對 數組(記憶體向量) 進行操縱 :
# Manipulate an in-memory array with a tf.data pipeline (TF 1.x API).
import tensorflow as tf

input_data = [1, 2, 3, 4, 5]
# Create a dataset from the in-memory array.
dataset = tf.data.Dataset.from_tensor_slices(input_data)
# shuffle(3): shuffle buffer of size 3; repeat(10): 10 epochs; batch(2): batches of 2.
dataset = dataset.shuffle(3).repeat(10).batch(2)
# Define the iterator. A one-shot iterator is auto-initialized but cannot be re-initialized.
iterator = dataset.make_one_shot_iterator()
# get_next() returns a tensor representing one batch of input data.
x = iterator.get_next()
y = x * x
with tf.Session() as sess:
    # 5 elements * 10 repeats / batch size 2 = 25 batches in total.
    for i in range(25):
        print(sess.run(y))
2. 讀取文本檔案裡的資料 ( tf.data.TextLineDataset )
# Read data from text files with tf.data.TextLineDataset (TF 1.x API).
import tensorflow as tf

# Create text files to serve as the input for this example.
with open("./test1.txt", "w") as file:
    file.write("File1, line1.\n")
    file.write("File1, line2.\n")
    file.write("File1, line3.\n")
    file.write("File1, line4.\n")
    file.write("File1, line5.\n")
with open("./test2.txt", "w") as file:
    file.write("File2, line1.\n")
    file.write("File2, line2.\n")
    file.write("File2, line3.\n")
    file.write("File2, line4.\n")
    file.write("File2, line5.\n")
# Create the dataset from the text files. Multiple files may be supplied.
input_files = ["./test1.txt", "./test2.txt"]
dataset = tf.data.TextLineDataset(input_files)
# Define the iterator.
iterator = dataset.make_one_shot_iterator()
# get_next() returns a string tensor representing one line of a file.
x = iterator.get_next()
with tf.Session() as sess:
    # 2 files * 5 lines = 10 records in total.
    for i in range(10):
        print(sess.run(x))
3. 解析TFRecord檔案裡的資料
準備工作:(mnist資料集的tfrecord格式的儲存)
# Preparation: save the MNIST dataset in TFRecord format (TF 1.x API).
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import numpy as np


def _float32_feature(value):
    # Wrap a single float as a tf.train.Feature holding a FloatList.
    return tf.train.Feature(float_list=tf.train.FloatList(value=[value]))


def _int64_feature(value):
    # Wrap a single integer as a tf.train.Feature holding an Int64List.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))


def _bytes_feature(value):
    # Wrap a single bytes object as a tf.train.Feature holding a BytesList.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))


mnist = input_data.read_data_sets('./data', dtype=tf.uint8, one_hot=True)
train_images = mnist.train.images
train_labels = mnist.train.labels
train_num = mnist.train.num_examples
pixels = train_images.shape[1]  # 784 = 28*28
file_out = './data/output.tfrecords'
writer = tf.python_io.TFRecordWriter(file_out)
for index in range(train_num):
    # Serialize one image to a raw bytes sequence.
    image_raw = train_images[index].tostring()
    example = tf.train.Example(features=tf.train.Features(feature={
        'pixels': _int64_feature(pixels),
        'label': _int64_feature(np.argmax(train_labels[index])),
        'x': _float32_feature(0.1),
        'y': _bytes_feature(bytes('abcde', 'utf-8')),
        'image_raw': _bytes_feature(image_raw)}))
    writer.write(example.SerializeToString())
writer.close()
準備工作:(mnist資料集的tfrecord格式的讀取)
# Preparation: read the MNIST TFRecord file with the queue-runner API (TF 1.x).
import tensorflow as tf

reader = tf.TFRecordReader()
# match_filenames_once creates a local variable, so
# tf.local_variables_initializer() must run before the queue is used.
files = tf.train.match_filenames_once('./data/output.*')
filename_queue = tf.train.string_input_producer(files)
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
    serialized_example,
    features={
        'image_raw': tf.FixedLenFeature([], tf.string),
        'pixels': tf.FixedLenFeature([], tf.int64),
        'label': tf.FixedLenFeature([], tf.int64),
        'x': tf.FixedLenFeature([], tf.float32),
        'y': tf.FixedLenFeature([], tf.string)
    })
# Decode the serialized bytes into a uint8 vector.
image = tf.decode_raw(features['image_raw'], tf.uint8)
x = features['x']
y = features['y']
label = tf.cast(features['label'], tf.int32)
pixels = tf.cast(features['pixels'], tf.int32)
# tf.train.batch requires a fully defined static shape for each element.
image.set_shape([784])
batch_size = 2
image_batch, label_batch, pixels_batch, x_batch, y_batch = tf.train.batch(
    [image, label, pixels, x, y],
    batch_size=batch_size, capacity=1000 + 3 * batch_size)
coord = tf.train.Coordinator()
with tf.Session() as sess:
    sess.run(tf.local_variables_initializer())
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    for i in range(1):
        print(sess.run([image_batch, label_batch, pixels_batch, x_batch, y_batch]))
    # Stop the input threads cleanly before the session closes.
    coord.request_stop()
    coord.join(threads)
正式工作:(mnist資料集的tfrecord格式 使用 TFRecordDataset 資料集讀取)
# Main example: read the MNIST TFRecord file with tf.data.TFRecordDataset (TF 1.x).
import tensorflow as tf

files = tf.gfile.Glob('./data/output.*')
dataset = tf.data.TFRecordDataset(files)


def parser(record):
    # Parse one serialized tf.train.Example into its component tensors.
    features = tf.parse_single_example(
        record,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
            'x': tf.FixedLenFeature([], tf.float32),
            'y': tf.FixedLenFeature([], tf.string)
        })
    # Decode the serialized bytes into a uint8 vector.
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    x = features['x']
    y = features['y']
    label = tf.cast(features['label'], tf.int32)
    pixels = tf.cast(features['pixels'], tf.int32)
    # A fully defined static shape is required for downstream batching.
    image.set_shape([784])
    return image, label, pixels, x, y


# map() applies the parsing function to every record in the dataset.
dataset = dataset.map(parser)
# Shuffle, repeat and batch the dataset.
dataset = dataset.shuffle(3).repeat(2).batch(2)
# Define the iterator that traverses the dataset.
iterator = dataset.make_one_shot_iterator()
# Fetch the data; the tensors can be used for further computation.
image, label, pixels, x, y = iterator.get_next()
with tf.Session() as sess:
    for i in range(1):
        print(sess.run([image, label, pixels, x, y]))
4. 使用 initializable_iterator 來動態初始化資料集
# Create the dataset from TFRecord files whose paths are given by a
# placeholder; the concrete paths are supplied later, at initialization time.
input_files = tf.placeholder(tf.string)
dataset = tf.data.TFRecordDataset(input_files)
# NOTE(review): `parser` is the parse function from the previous section;
# this fragment assumes it returns an (image, label) pair — confirm before use.
dataset = dataset.map(parser)
# Define an initializable_iterator that traverses the dataset.
iterator = dataset.make_initializable_iterator()
image, label = iterator.get_next()
with tf.Session() as sess:
    # First initialize the iterator, supplying the value of input_files.
    sess.run(iterator.initializer,
             feed_dict={input_files: ["output.tfrecords"]})
    # Traverse all data for one epoch; OutOfRangeError is raised at the end.
    while True:
        try:
            x, y = sess.run([image, label])
        except tf.errors.OutOfRangeError:
            break
詳細例子:
# Detailed example: TFRecordDataset with placeholder file paths and an
# initializable iterator (TF 1.x API).
import tensorflow as tf

files = tf.placeholder(tf.string)
dataset = tf.data.TFRecordDataset(files)


def parser(record):
    # Parse one serialized tf.train.Example into its component tensors.
    features = tf.parse_single_example(
        record,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
            'x': tf.FixedLenFeature([], tf.float32),
            'y': tf.FixedLenFeature([], tf.string)
        })
    # Decode the serialized bytes into a uint8 vector.
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    x = features['x']
    y = features['y']
    label = tf.cast(features['label'], tf.int32)
    pixels = tf.cast(features['pixels'], tf.int32)
    # A fully defined static shape is required for downstream batching.
    image.set_shape([784])
    return image, label, pixels, x, y


# map() applies the parsing function to every record in the dataset.
dataset = dataset.map(parser)
# Shuffle, repeat and batch the dataset.
dataset = dataset.shuffle(3).repeat(2).batch(2)
# Define an initializable_iterator (re-initializable, unlike one-shot).
iterator = dataset.make_initializable_iterator()
# Fetch the data; the tensors can be used for further computation.
image, label, pixels, x, y = iterator.get_next()
with tf.Session() as sess:
    # First initialize the iterator, supplying the value of the file placeholder.
    sess.run(iterator.initializer,
             feed_dict={files: ["data/output.tfrecords"]})
    for i in range(1):
        print(sess.run([image, label, pixels, x, y]))
或(修改版):
# Modified version: TFRecordDataset with tf.train.match_filenames_once and an
# initializable iterator (TF 1.x API).
import tensorflow as tf

files = tf.train.match_filenames_once('./data/output.*')
dataset = tf.data.TFRecordDataset(files)


def parser(record):
    # Parse one serialized tf.train.Example into its component tensors.
    features = tf.parse_single_example(
        record,
        features={
            'image_raw': tf.FixedLenFeature([], tf.string),
            'pixels': tf.FixedLenFeature([], tf.int64),
            'label': tf.FixedLenFeature([], tf.int64),
            'x': tf.FixedLenFeature([], tf.float32),
            'y': tf.FixedLenFeature([], tf.string)
        })
    # Decode the serialized bytes into a uint8 vector.
    image = tf.decode_raw(features['image_raw'], tf.uint8)
    x = features['x']
    y = features['y']
    label = tf.cast(features['label'], tf.int32)
    pixels = tf.cast(features['pixels'], tf.int32)
    # A fully defined static shape is required for downstream batching.
    image.set_shape([784])
    return image, label, pixels, x, y


# map() applies the parsing function to every record in the dataset.
dataset = dataset.map(parser)
# Shuffle, repeat and batch the dataset.
dataset = dataset.shuffle(3).repeat(2).batch(2)
# Define an initializable_iterator (re-initializable, unlike one-shot).
iterator = dataset.make_initializable_iterator()
# Fetch the data; the tensors can be used for further computation.
image, label, pixels, x, y = iterator.get_next()
with tf.Session() as sess:
    # match_filenames_once creates a local variable, so both variable
    # initializers must run BEFORE the iterator is initialized.
    sess.run((tf.global_variables_initializer(),
              tf.local_variables_initializer()))
    # Then initialize the iterator itself.
    sess.run(iterator.initializer)
    # Traverse all data; OutOfRangeError is raised when the data is exhausted.
    while True:
        try:
            print(sess.run([image, label, pixels, x, y]))
        except tf.errors.OutOfRangeError:
            break
注:
疊代器:
make_one_shot_iterator 方法不能重複初始化,即one-shot(一次性),但是可以自動初始化。