import numpy as np import pandas as pd import tensorflow as tf from tensorflow import feature_column from tensorflow.keras import layers from sklearn.model_selection import train_test_split
data
# Source CSV for the heart-disease dataset used throughout this walkthrough.
URL = 'https://storage.googleapis.com/applied-dl/heart.csv'

# Load the whole CSV into memory as a pandas DataFrame.
dataframe = pd.read_csv(URL)

# First carve off 20% of all rows as the test set ...
train, test = train_test_split(dataframe, test_size=0.2)
# ... then carve 20% of the remaining rows off as the validation set.
train, val = train_test_split(train, test_size=0.2)
Use tf.data to create an input pipeline
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    """Build a batched ``tf.data.Dataset`` from a pandas DataFrame.

    The input frame is copied first, so the caller's DataFrame keeps its
    ``'target'`` column.

    Args:
        dataframe: DataFrame that contains a ``'target'`` column; that column
            becomes the labels and every other column becomes a feature.
        shuffle: When true, shuffle the elements with a buffer as large as the
            number of rows.
        batch_size: Number of elements per batch.

    Returns:
        A dataset yielding ``(features_dict, labels)`` batches.
    """
    features = dataframe.copy()
    labels = features.pop('target')
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=len(features))
    return dataset.batch(batch_size)
Feature columns
TensorFlow provides many types of feature columns, all of which live in the feature_column module. Several commonly used types are created below, together with a demonstration of how each one transforms a column from the dataframe.
from tensorflow import feature_column
A utility method for creating a feature column and transforming a batch of data:
# Use one small batch of training data to demonstrate several feature columns.
# The batch is built directly from `train` because `train_ds` is only created
# further down the script; reading it here would raise NameError.
# [0] keeps the features dict and drops the labels.
example_batch = next(iter(df_to_dataset(train, batch_size=5)))[0]


def demo(feature_column):
    """Print how a feature column transforms ``example_batch``.

    Args:
        feature_column: The feature column (or list of feature columns) to
            wrap in a ``DenseFeatures`` layer. Note that inside this function
            the parameter name shadows the ``feature_column`` module; the body
            does not use the module, so this is harmless.
    """
    feature_layer = layers.DenseFeatures(feature_column)
    print(feature_layer(example_batch).numpy())
1. Bucketized column
A bucketized column discretizes a numeric value; for example, it can split age into several buckets.
# The bucketized column needs a numeric source column; `age` was previously
# used without ever being defined.
age = feature_column.numeric_column('age')

# Split age into ranges delimited by these boundaries.
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
demo(age_buckets)
2. Categorical column
In this dataset, thal is represented as a string (e.g. 'fixed', 'normal', or 'reversible'). We cannot feed strings directly to the model. Instead, we must first map them to numeric values. Categorical vocabulary columns provide a way to represent strings as one-hot vectors. The vocabulary can be passed as a list using categorical_column_with_vocabulary_list, or loaded from a file using categorical_column_with_vocabulary_file.
# Map the string-valued 'thal' column onto a fixed vocabulary.
thal = feature_column.categorical_column_with_vocabulary_list(
    key='thal',
    vocabulary_list=['fixed', 'normal', 'reversible'],
)

# Wrap the categorical column in an indicator column and show the result.
thal_one_hot = feature_column.indicator_column(thal)
demo(thal_one_hot)
3.1 Embedding columns
Suppose that instead of just a few possible strings, each category has thousands (or more) of values. For a number of reasons, as the number of categories grows, training a neural network with one-hot encodings becomes infeasible. We can use an embedding column to overcome this limitation. Instead of representing the data as a high-dimensional one-hot vector, an embedding column represents it as a low-dimensional dense vector in which each cell can contain any number, not just 0 or 1. The size of the embedding is a parameter that must be tuned.
# An embedding column is the better choice when a categorical column has many
# possible values.
thal_embedding = feature_column.embedding_column(
    categorical_column=thal,
    dimension=8,
)
demo(thal_embedding)
3.2 Hashed feature columns
Another way to represent a categorical column with a large number of values is to use categorical_column_with_hash_bucket. This feature column computes a hash value of the input, then selects one of the hash_bucket_size buckets to encode the string. When using this column, you do not need to provide a vocabulary, and you can choose to make the number of hash buckets much smaller than the actual number of categories to save space.
**Note:** an important disadvantage of this technique is that there may be collisions, in which different strings are mapped to the same bucket. In practice, hashed feature columns still work well for some datasets regardless.
# Hash the 'thal' strings into a fixed number of buckets; no vocabulary needed.
thal_hashed = feature_column.categorical_column_with_hash_bucket(
    key='thal',
    hash_bucket_size=1000,
)
demo(feature_column.indicator_column(categorical_column=thal_hashed))
4. Crossed feature columns
Combining multiple features into a single feature is known as a feature cross, and it enables the model to learn separate weights for each combination of features. Here, we will create a new feature that is the cross of age and thal. Please note that crossed_column does not build the full table of all possible combinations (which could be very large). Instead, it is backed by a hashed column, so you can choose how large the table is.
# Cross the bucketized age with thal; the combinations are hashed into
# `hash_bucket_size` buckets rather than enumerated.
crossed_feature = feature_column.crossed_column(
    [age_buckets, thal], hash_bucket_size=1000)

# The original read `feature_column,indicator_column(...)`: the comma passed
# two arguments to demo() and referenced an undefined `indicator_column`.
demo(feature_column.indicator_column(crossed_feature))
Select the column to use and train the model
# Collect every feature column that will be fed to the model.
feature_columns = []

# Numeric columns. The dataset's resting-blood-pressure column is spelled
# 'trestbps' (the original had the typo 'tresbps').
for header in ['age', 'trestbps', 'chol', 'thalach', 'oldpeak', 'ca']:
    feature_columns.append(feature_column.numeric_column(header))

# Bucketized column. `age` is defined here so this section does not rely on
# a name that was never assigned.
age = feature_column.numeric_column('age')
age_buckets = feature_column.bucketized_column(
    age, boundaries=[18, 25, 30, 35, 40, 45, 50, 55, 60, 65])
# Append to the list `feature_columns`, not the `feature_column` module.
feature_columns.append(age_buckets)

# Categorical column, wrapped in an indicator column.
thal = feature_column.categorical_column_with_vocabulary_list(
    'thal', ['fixed', 'normal', 'reversible'])
thal_one_hot = feature_column.indicator_column(thal)
feature_columns.append(thal_one_hot)

# Embedding column.
thal_embedding = feature_column.embedding_column(thal, dimension=8)
feature_columns.append(thal_embedding)

# Crossed column, wrapped in an indicator column.
crossed_feature = feature_column.crossed_column(
    [age_buckets, thal], hash_bucket_size=1000)
crossed_feature = feature_column.indicator_column(crossed_feature)
feature_columns.append(crossed_feature)
Create a new feature layer
We have defined the feature column above. We need to use the dense features layer to input the feature column into the Keras model.
# Wrap the selected feature columns in a DenseFeatures layer so they can be
# used as the first layer of the Keras model.
feature_layer = layers.DenseFeatures(feature_columns=feature_columns)
Training data set
# One batch size shared by all three splits.
batch_size = 32

# Only the training split is shuffled; validation and test keep their order.
train_ds = df_to_dataset(train, shuffle=True, batch_size=batch_size)
val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
test_ds = df_to_dataset(test, shuffle=False, batch_size=batch_size)
Create, compile and train models
# Feature layer, then two 128-unit ReLU hidden layers, then a single sigmoid
# output unit.
model = tf.keras.Sequential([
    feature_layer,
    layers.Dense(128, activation='relu'),
    layers.Dense(128, activation='relu'),
    layers.Dense(1, activation='sigmoid'),
])

# `run_eagerly` is the Keras compile option; the original `run_early` is not a
# recognized argument.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
    run_eagerly=True,
)

# Train for 5 epochs, evaluating on the validation split during training.
model.fit(train_ds, validation_data=val_ds, epochs=5)