Description
(bowen) bowen@LAPTOP-NHPO4HKH:~/cell$ python cell2fate-gpu.py
Global seed set to 0
CUDA available: True
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
Leiden clustering ...
WARNING: You’re trying to run this on 2277 dimensions of .X, if you really want this, set use_rep='X'.
Falling back to preprocessing with sc.pp.pca and default params.
Number of Leiden Clusters: 10
Maximal Number of Modules: 11
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Epoch 1/500: 0%| | 0/500 [00:00<?, ?it/s]
Traceback (most recent call last):
File "/home/bowen/cell/cell2fate-gpu.py", line 166, in
main()
File "/home/bowen/cell/cell2fate-gpu.py", line 162, in main
run_cell2fate_analysis(adata_kpc, '/mnt/c/Users/Bowen/Desktop/kpc_cell2fate_analysis')
File "/home/bowen/cell/cell2fate-gpu.py", line 92, in run_cell2fate_analysis
mod.train(batch_size=32)
File "/home/bowen/cell2fate/cell2fate/_cell2fate_DynamicalModel.py", line 164, in train
super().train(**kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/model/base/_pyromixin.py", line 146, in train
return runner()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/train/_trainrunner.py", line 74, in call
self.trainer.fit(self.training_plan, self.data_splitter)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/train/_trainer.py", line 186, in fit
super().fit(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 740, in fit
self._call_and_handle_interrupt(
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 685, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 777, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1199, in _run
self._dispatch()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1279, in _dispatch
self.training_type_plugin.start_training(self)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1289, in run_stage
return self._run_train()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1319, in _run_train
self.fit_loop.run()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 193, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 90, in advance
outputs = self.manual_loop.run(split_batch, batch_idx)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/manual_loop.py", line 111, in advance
training_step_output = self.trainer.accelerator.training_step(step_kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 219, in training_step
return self.training_type_plugin.training_step(*step_kwargs.values())
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 213, in training_step
return self.model.training_step(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/train/_trainingplans.py", line 741, in training_step
loss = torch.Tensor([self.svi.step(*args, **kwargs)])
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/svi.py", line 145, in step
loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/trace_elbo.py", line 140, in loss_and_grads
for model_trace, guide_trace in self._get_traces(model, guide, args, kwargs):
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/elbo.py", line 237, in _get_traces
yield self._get_trace(model, guide, args, kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/trace_elbo.py", line 57, in _get_trace
model_trace, guide_trace = get_importance_trace(
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/enum.py", line 75, in get_importance_trace
model_trace.compute_log_prob()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/poutine/trace_struct.py", line 230, in compute_log_prob
log_p = site["fn"].log_prob(
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/torch/distributions/gamma.py", line 71, in log_prob
self.rate * value - torch.lgamma(self.concentration))
RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)
#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)
typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);
namespace std {
using ::signbit;
using ::isfinite;
using ::isinf;
using ::isnan;
using ::abs;
using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atanf;
using ::atan2;
using ::atan2f;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;
using ::exp;
using ::expf;
using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;
using ::fmod;
using ::fmodf;
using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;
using ::log;
using ::logf;
using ::log10;
using ::log10f;
using ::modf;
using ::modff;
using ::pow;
using ::powf;
using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;
using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;
using ::tanh;
using ::tanhf;
using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;
using ::copysign;
using ::copysignf;
using ::erf;
using ::erff;
using ::erfc;
using ::erfcf;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fmaf;
using ::fma;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;
using ::nan;
using ::nanf;
using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;
} // namespace std
// NB: Order matters for this macro; it is relied upon in
// promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
_(uint8_t, Byte) /* 0 */ \
_(int8_t, Char) /* 1 */ \
_(int16_t, Short) /* 2 */ \
_(int, Int) /* 3 */ \
_(int64_t, Long) /* 4 */ \
_(at::Half, Half) /* 5 */ \
_(float, Float) /* 6 */ \
_(double, Double) /* 7 */ \
_(std::complex<at::Half>, ComplexHalf) /* 8 */ \
_(std::complex<float>, ComplexFloat) /* 9 */ \
_(std::complex<double>, ComplexDouble) /* 10 */ \
_(bool, Bool) /* 11 */ \
_(void, QInt8) /* 12 */ \
_(void, QUInt8) /* 13 */ \
_(void, QInt32) /* 14 */ \
_(at::BFloat16, BFloat16) /* 15 */
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(_) \
_(uint8_t, Byte) \
_(int8_t, Char) \
_(int16_t, Short) \
_(int, Int) \
_(int64_t, Long) \
_(at::Half, Half) \
_(float, Float) \
_(double, Double) \
_(std::complex<float>, ComplexFloat) \
_(std::complex<double>, ComplexDouble) \
_(bool, Bool) \
_(at::BFloat16, BFloat16)
enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
Undefined,
NumOptions
};
template <typename T, int size>
struct Array {
T data[size];
__device__ T operator[](int i) const {
return data[i];
}
__device__ T& operator[](int i) {
return data[i];
}
Array() = default;
Array(const Array&) = default;
Array& operator=(const Array&) = default;
};
template <typename scalar_t>
__device__ inline scalar_t load(char* base_ptr, uint32_t offset) {
return *(reinterpret_cast<scalar_t*>(base_ptr) + offset);
}
template <typename scalar_t>
__device__ inline void store(scalar_t value, char *base_ptr, uint32_t offset) {
*(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;
}
// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
scalar_t val[vec_size];
};
template <typename T> __device__ T lgamma_kernel(T a) { return lgamma(a); }
// TODO: setup grid-stride loop
extern "C" __global__
void lgamma_kernel_vectorized4_kernel(
const int N,
Array<char*, 1+1> data,
float scalar_val) //[1+1],
{
constexpr int vec_size = 4;
int remaining = N - block_work_size * blockIdx.x;
auto thread_idx = threadIdx.x;
int idx = blockIdx.x;
float arg0[4];
float out[4];
if (remaining < block_work_size) {
#pragma unroll
for (int j = 0; j < thread_work_size; j++){
if (thread_idx >= remaining) {
break;
}
int linear_idx = thread_idx + block_work_size * idx;
arg0[j] = load<float>(data[1], linear_idx);
thread_idx += num_threads;
}
#pragma unroll
for (int j = 0; j < thread_work_size; j++) {
if ((threadIdx.x + j*num_threads) < remaining) {
out[j] = lgamma_kernel<float>(arg0[j] );
}
}
thread_idx = threadIdx.x;
#pragma unroll
for (int j = 0; j < thread_work_size; j++) {
if (thread_idx >= remaining) {
break;
}
int linear_idx = thread_idx + block_work_size * idx;
store<float>(out[j], data[0], linear_idx);
thread_idx += num_threads;
}
} else {
static constexpr int loop_size = thread_work_size / vec_size;
//actual loading
using vec_t_input = aligned_vector<float, vec_size>;
vec_t_input * vec0 = reinterpret_cast<vec_t_input *>(data[0+1]) + block_work_size / vec_size * idx;
#pragma unroll
for (int i = 0; i<loop_size; i++){
vec_t_input v;
v = vec0[thread_idx];
#pragma unroll
for (int j=0; j < vec_size; j++){
arg0[vec_size * i + j] = v.val[j];
}
thread_idx += num_threads;
}
#pragma unroll
for (int j = 0; j < thread_work_size; j++) {
out[j] = lgamma_kernel<float>(arg0[j]);
}
using vec_t_output = aligned_vector<float, vec_size>;
vec_t_output * to_ = reinterpret_cast<vec_t_output *>(data[0]) + block_work_size / vec_size * idx;
int thread_idx = threadIdx.x;
#pragma unroll
for (int i = 0; i<loop_size; i++){
vec_t_output v;
#pragma unroll
for (int j=0; j<vec_size; j++){
v.val[j] = out[vec_size * i + j];
}
to_[thread_idx] = v;
thread_idx += num_threads;
}
}
}
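For context on the failure itself: the nvrtc "invalid value for --gpu-architecture (-arch)" error is raised while PyTorch tries to JIT-compile the fused lgamma kernel dumped above, and it typically means the CUDA toolkit bundled into the installed PyTorch wheel does not recognize the GPU's compute capability (for example, an older cu10x build running on an Ampere-or-newer card). A minimal diagnostic sketch, assuming nothing beyond a working torch install, that compares the device's capability against the architectures the wheel was built for:

```python
import torch

# Which torch build is installed and which CUDA toolkit it was compiled with.
print("torch:", torch.__version__)
print("built with CUDA:", torch.version.cuda)

# The physical GPU and its compute capability, e.g. (8, 6) for sm_86.
print("device:", torch.cuda.get_device_name(0))
print("capability:", torch.cuda.get_device_capability(0))

# Architectures this build can generate code for. If the device's sm_XX
# is not in this list, NVRTC-compiled kernels (like the fused lgamma
# kernel above) can fail with the --gpu-architecture error.
print("supported archs:", torch.cuda.get_arch_list())
```

If the device's architecture is missing from that list, reinstalling PyTorch from a wheel built against a newer CUDA toolkit should resolve the compile error; as a stopgap, scvi-tools of this era exposes a use_gpu flag on train, so something like mod.train(use_gpu=False, batch_size=32) should avoid the NVRTC path entirely by running on CPU.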