We have a SLURM batch job running a TF2/Keras script that fails; the same script also fails when called directly on a node that has a GPU. Here are the contents of the Python script (stocks.py):
from datetime import date
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
from keras.optimizers import adam
from keras.layers import Dropout
from tensorflow.keras.callbacks import Callback, EarlyStopping
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
from sklearn.metrics import r2_score, mean_squared_error, accuracy_score
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.recurrent import LSTM
from keras.models import load_model
from keras.callbacks import EarlyStopping, ModelCheckpoint
import warnings
import os

os.environ['TF_CPP_MIN_LOG_LEVEL'] = "3"
warnings.filterwarnings('ignore')

import tensorflow as tf
import logging
logging.getLogger('tesorflow').setLevel(logging.FATAL)

delay = 252
window = 60
factor = 15
K = 8.4
sbo = 1.25
sso = 1.25
sbc = 0.75
ssc = 0.5
r = 0.02
tran_cost = 0.0002
leverage = 1.0
start_val = 100
bo = 1
so = -1

X_pd = pd.read_pickle('./data/X_pd.pkl')

X = pd.DataFrame(columns=range(0, window))
Y = []
for tag in X_pd.columns[:1]:
    # i=0 ....len(X_pd.index)-window
    for i in range(0, len(X_pd.index) - window):
        X_example = X_pd.loc[i:i + window - 1][tag].values
        X = X.append(pd.Series(X_example), ignore_index=True)
        Y.append(X_pd.loc[i + window][tag])
    print('done %s stocks' % (tag))
Y = pd.DataFrame(Y)

# normalization
SS = StandardScaler()
features = SS.fit_transform(X.values)
X = features
X = pd.DataFrame(X)

# LSTM model
def trainLSTMModel(layers, neurons, d):
    model = Sequential()
    model.add(LSTM(neurons[0], input_shape=(layers[1], layers[2]), return_sequences=False, activation='relu'))
    # model.add(Dropout(d))
    # model.add(LSTM(neurons[1], input_shape=(layers[1], layers[2]), return_sequences=False))
    # model.add(Dropout(d))
    # model.add(Dense(neurons[2], kernel_initializer="uniform", activation='relu'))
    model.add(Dense(neurons[3], kernel_initializer="uniform", activation='relu'))
    optimizer = adam(learning_rate=0.001)
    # adam = Adam(decay=0.2)
    # predict up and down
    # model.compile(optimizer="adam", loss="binary_crossentropy", metrics=['accuracy'])
    model.compile(loss='mse', optimizer=optimizer)
    model.summary()
    return model

length = X.shape[0]
X = np.array(X)
Y = np.array(Y)

time_step = 60
d = 0.3
output = 1
shape = [length, time_step, output]  # feature, window, output
neurons = [64, 64, 32, 1]
epochs = 100
batch_size = 10000

model = trainLSTMModel(shape, neurons, d)

# shape from [samples, timesteps] into [samples, timesteps, features]
n_features = 1
X = X.reshape((X.shape[0], X.shape[1], n_features))

gpu_no = 0
with tf.device('/gpu:' + str(gpu_no)):
    # sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True))
    # keras.backend.set_session(sess)
    print('model_manager: running tensorflow version: ' + tf.__version__)
    print('model_manager: will attempt to run on ' + '/gpu:' + str(gpu_no))
    model.fit(X, Y, epochs=epochs, verbose=2, batch_size=batch_size)
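For reference, this is the kind of minimal check I would run first to see whether TensorFlow can enumerate the GPU at all, independent of the Keras model. It is only a diagnostic sketch against TF 2.0's experimental device-listing API, not part of stocks.py:

# Diagnostic sketch, not part of stocks.py: ask TF 2.0 which physical GPUs it can see.
import tensorflow as tf

print('TF version:', tf.__version__)
print('GPUs visible to TF:', tf.config.experimental.list_physical_devices('GPU'))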
The log shows this:
Loading requirement: cuda10.1/toolkit/10.1.243
Loading cm-ml-python3deps/3.3.0
  Loading requirement: gcc5/5.5.0 python36
Loading tensorflow2-py36-cuda10.1-gcc/2.0.0
  Loading requirement: ml-pythondeps-py36-cuda10.1-gcc/3.3.0 openblas/dynamic/0.2.20 hdf5_18/1.8.20 keras-py36-cuda10.1-gcc/2.3.1 protobuf3-gcc/3.8.0 nccl2-cuda10.1-gcc/2.7.8
Loading openmpi/cuda/64/3.1.4
  Loading requirement: hpcx/2.4.0
2021-08-18 11:11:43.064175: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2021-08-18 11:18:08.026219: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcuda.so.1
2021-08-18 11:18:08.031771: E tensorflow/stream_executor/cuda/cuda_driver.cc:318] failed call to cuInit: CUDA_ERROR_UNKNOWN: unknown error
2021-08-18 11:18:08.031811: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: node001
2021-08-18 11:18:08.031819: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: node001
2021-08-18 11:18:08.031921: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 460.73.1
2021-08-18 11:18:08.031958: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 460.73.1
2021-08-18 11:18:08.031966: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 460.73.1
2021-08-18 11:18:08.032266: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX512F
Using TensorFlow backend.
done A stocks
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
lstm_1 (LSTM)                (None, 64)                16896
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65
=================================================================
Total params: 16,961
Trainable params: 16,961
Non-trainable params: 0
_________________________________________________________________
model_manager: running tensorflow version: 2.0.0
model_manager: will attempt to run on /gpu:0
Traceback (most recent call last):
  File "stocks.py", line 99, in <module>
    model.fit(X, Y, epochs=epochs, verbose=2,batch_size=batch_size)
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/engine/training.py", line 1213, in fit
    self._make_train_function()
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/engine/training.py", line 316, in _make_train_function
    loss=self.total_loss)
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 75, in symbolic_fn_wrapper
    return func(*args, **kwargs)
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/optimizers.py", line 519, in get_updates
    for (i, p) in enumerate(params)]
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/optimizers.py", line 519, in <listcomp>
    for (i, p) in enumerate(params)]
  File "/cm/shared/apps/keras-py36-cuda10.1-gcc/2.3.1/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py", line 963, in zeros
    v = tf.zeros(shape=shape, dtype=dtype, name=name)
  File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py", line 2349, in zeros
    output = _constant_if_small(zero, shape, dtype, name)
  File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/ops/array_ops.py", line 2307, in _constant_if_small
    return constant(value, shape=shape, dtype=dtype, name=name)
  File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py", line 227, in constant
    allow_broadcast=True)
  File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py", line 235, in _constant_impl
    t = convert_to_eager_tensor(value, ctx, dtype)
  File "/cm/shared/apps/tensorflow2-py36-cuda10.1-gcc/2.0.0/lib/python3.6/site-packages/tensorflow_core/python/framework/constant_op.py", line 96, in convert_to_eager_tensor
    return ops.EagerTensor(value, ctx.device_name, dtype)
RuntimeError: /job:localhost/replica:0/task:0/device:GPU:0 unknown device.
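I also wondered whether mixing standalone keras 2.3.1 with tensorflow 2.0 is part of the problem. One variation I am considering (an untested sketch that builds the same model with tensorflow.keras only and relies on TensorFlow's default device placement instead of the hard /gpu:0 pin) would look roughly like this:

# Untested sketch: same LSTM model using tensorflow.keras only, no standalone keras,
# and no explicit tf.device('/gpu:0') context around model.fit().
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

def build_model(time_step, n_features):
    model = Sequential()
    model.add(LSTM(64, input_shape=(time_step, n_features), activation='relu'))
    model.add(Dense(1, kernel_initializer='uniform', activation='relu'))
    model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001))
    return model

I have not confirmed whether this avoids the error, since the cuInit failure in the log suggests the device is not usable in the first place.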
Why is the script not seeing the GPU?