Skip to content

No GPU/TPU found in RTX 4060 #19

@xbw445491720

Description

@xbw445491720

(bowen) bowen@LAPTOP-NHPO4HKH:~/cell$ python cell2fate-gpu.py
Global seed set to 0
CUDA available: True
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)
Leiden clustering ...
WARNING: You’re trying to run this on 2277 dimensions of .X, if you really want this, set use_rep='X'.
Falling back to preprocessing with sc.pp.pca and default params.
Number of Leiden Clusters: 10
Maximal Number of Modules: 11
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
Epoch 1/500: 0%| | 0/500 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/bowen/cell/cell2fate-gpu.py", line 166, in <module>
main()
File "/home/bowen/cell/cell2fate-gpu.py", line 162, in main
run_cell2fate_analysis(adata_kpc, '/mnt/c/Users/Bowen/Desktop/kpc_cell2fate_analysis')
File "/home/bowen/cell/cell2fate-gpu.py", line 92, in run_cell2fate_analysis
mod.train(batch_size=32)
File "/home/bowen/cell2fate/cell2fate/_cell2fate_DynamicalModel.py", line 164, in train
super().train(**kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/model/base/_pyromixin.py", line 146, in train
return runner()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/train/_trainrunner.py", line 74, in __call__
self.trainer.fit(self.training_plan, self.data_splitter)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/train/_trainer.py", line 186, in fit
super().fit(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 740, in fit
self._call_and_handle_interrupt(
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 685, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 777, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1199, in _run
self._dispatch()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1279, in _dispatch
self.training_type_plugin.start_training(self)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1289, in run_stage
return self._run_train()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/trainer/trainer.py", line 1319, in _run_train
self.fit_loop.run()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 193, in advance
batch_output = self.batch_loop.run(batch, batch_idx)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 90, in advance
outputs = self.manual_loop.run(split_batch, batch_idx)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/loops/optimization/manual_loop.py", line 111, in advance
training_step_output = self.trainer.accelerator.training_step(step_kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/accelerators/accelerator.py", line 219, in training_step
return self.training_type_plugin.training_step(*step_kwargs.values())
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 213, in training_step
return self.model.training_step(*args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/scvi/train/_trainingplans.py", line 741, in training_step
loss = torch.Tensor([self.svi.step(*args, **kwargs)])
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/svi.py", line 145, in step
loss = self.loss_and_grads(self.model, self.guide, *args, **kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/trace_elbo.py", line 140, in loss_and_grads
for model_trace, guide_trace in self._get_traces(model, guide, args, kwargs):
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/elbo.py", line 237, in _get_traces
yield self._get_trace(model, guide, args, kwargs)
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/trace_elbo.py", line 57, in _get_trace
model_trace, guide_trace = get_importance_trace(
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/infer/enum.py", line 75, in get_importance_trace
model_trace.compute_log_prob()
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/pyro/poutine/trace_struct.py", line 230, in compute_log_prob
log_p = site["fn"].log_prob(
File "/home/bowen/miniconda3/envs/bowen/lib/python3.9/site-packages/torch/distributions/gamma.py", line 71, in log_prob
self.rate * value - torch.lgamma(self.concentration))
RuntimeError: nvrtc: error: invalid value for --gpu-architecture (-arch)

#define POS_INFINITY __int_as_float(0x7f800000)
#define INFINITY POS_INFINITY
#define NEG_INFINITY __int_as_float(0xff800000)
#define NAN __int_as_float(0x7fffffff)

typedef long long int int64_t;
typedef unsigned int uint32_t;
typedef signed char int8_t;
typedef unsigned char uint8_t; // NOTE: this MUST be "unsigned char"! "char" is equivalent to "signed char"
typedef short int16_t;
static_assert(sizeof(int64_t) == 8, "expected size does not match");
static_assert(sizeof(uint32_t) == 4, "expected size does not match");
static_assert(sizeof(int8_t) == 1, "expected size does not match");
constexpr int num_threads = 128;
constexpr int thread_work_size = 4; // TODO: make template substitution once we decide where those vars live
constexpr int block_work_size = thread_work_size * num_threads;
//TODO use _assert_fail, because assert is disabled in non-debug builds
#define ERROR_UNSUPPORTED_CAST assert(false);

namespace std {

using ::signbit;
using ::isfinite;
using ::isinf;
using ::isnan;

using ::abs;

using ::acos;
using ::acosf;
using ::asin;
using ::asinf;
using ::atan;
using ::atanf;
using ::atan2;
using ::atan2f;
using ::ceil;
using ::ceilf;
using ::cos;
using ::cosf;
using ::cosh;
using ::coshf;

using ::exp;
using ::expf;

using ::fabs;
using ::fabsf;
using ::floor;
using ::floorf;

using ::fmod;
using ::fmodf;

using ::frexp;
using ::frexpf;
using ::ldexp;
using ::ldexpf;

using ::log;
using ::logf;

using ::log10;
using ::log10f;
using ::modf;
using ::modff;

using ::pow;
using ::powf;

using ::sin;
using ::sinf;
using ::sinh;
using ::sinhf;

using ::sqrt;
using ::sqrtf;
using ::tan;
using ::tanf;

using ::tanh;
using ::tanhf;

using ::acosh;
using ::acoshf;
using ::asinh;
using ::asinhf;
using ::atanh;
using ::atanhf;
using ::cbrt;
using ::cbrtf;

using ::copysign;
using ::copysignf;

using ::erf;
using ::erff;
using ::erfc;
using ::erfcf;
using ::exp2;
using ::exp2f;
using ::expm1;
using ::expm1f;
using ::fdim;
using ::fdimf;
using ::fmaf;
using ::fma;
using ::fmax;
using ::fmaxf;
using ::fmin;
using ::fminf;
using ::hypot;
using ::hypotf;
using ::ilogb;
using ::ilogbf;
using ::lgamma;
using ::lgammaf;
using ::llrint;
using ::llrintf;
using ::llround;
using ::llroundf;
using ::log1p;
using ::log1pf;
using ::log2;
using ::log2f;
using ::logb;
using ::logbf;
using ::lrint;
using ::lrintf;
using ::lround;
using ::lroundf;

using ::nan;
using ::nanf;

using ::nearbyint;
using ::nearbyintf;
using ::nextafter;
using ::nextafterf;
using ::remainder;
using ::remainderf;
using ::remquo;
using ::remquof;
using ::rint;
using ::rintf;
using ::round;
using ::roundf;
using ::scalbln;
using ::scalblnf;
using ::scalbn;
using ::scalbnf;
using ::tgamma;
using ::tgammaf;
using ::trunc;
using ::truncf;

} // namespace std

// NB: Order matters for this macro; it is relied upon in
// promoteTypesLookup and the serialization format.
// Note, some types have ctype as void because we don't support them in codegen
#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(_) \
_(uint8_t, Byte) /* 0 */ \
_(int8_t, Char) /* 1 */ \
_(int16_t, Short) /* 2 */ \
_(int, Int) /* 3 */ \
_(int64_t, Long) /* 4 */ \
_(at::Half, Half) /* 5 */ \
_(float, Float) /* 6 */ \
_(double, Double) /* 7 */ \
_(std::complex<at::Half>, ComplexHalf) /* 8 */ \
_(std::complex<float>, ComplexFloat) /* 9 */ \
_(std::complex<double>, ComplexDouble) /* 10 */ \
_(bool, Bool) /* 11 */ \
_(void, QInt8) /* 12 */ \
_(void, QUInt8) /* 13 */ \
_(void, QInt32) /* 14 */ \
_(at::BFloat16, BFloat16) /* 15 */

#define AT_FORALL_SCALAR_TYPES_WITH_COMPLEX_EXCEPT_COMPLEX_HALF(_) \
_(uint8_t, Byte) \
_(int8_t, Char) \
_(int16_t, Short) \
_(int, Int) \
_(int64_t, Long) \
_(at::Half, Half) \
_(float, Float) \
_(double, Double) \
_(std::complex<float>, ComplexFloat) \
_(std::complex<double>, ComplexDouble) \
_(bool, Bool) \
_(at::BFloat16, BFloat16)

enum class ScalarType : int8_t {
#define DEFINE_ENUM(_1, n) n,
AT_FORALL_SCALAR_TYPES_WITH_COMPLEX(DEFINE_ENUM)
#undef DEFINE_ENUM
Undefined,
NumOptions
};

template <typename T, int size>
struct Array {
T data[size];

__device__ T operator[](int i) const {
return data[i];
}
__device__ T& operator[](int i) {
return data[i];
}
Array() = default;
Array(const Array&) = default;
Array& operator=(const Array&) = default;
};

template <typename scalar_t>
__device__ inline scalar_t load(char* base_ptr, uint32_t offset) {
return *(reinterpret_cast<scalar_t *>(base_ptr) + offset);
}

template <typename scalar_t>
__device__ inline void store(scalar_t value, char *base_ptr, uint32_t offset) {
*(reinterpret_cast<scalar_t *>(base_ptr) + offset) = value;
}

// aligned vector generates vectorized load/store on CUDA
template<typename scalar_t, int vec_size>
struct alignas(sizeof(scalar_t) * vec_size) aligned_vector {
scalar_t val[vec_size];
};

template <typename T> T lgamma_kernel(T a) { return lgamma(a); }

// TODO: setup grid-stride loop

extern "C" __global__
void lgamma_kernel_vectorized4_kernel(
const int N,
Array<char*, 1+1> data,
float scalar_val) //[1+1],
{
constexpr int vec_size = 4;
int remaining = N - block_work_size * blockIdx.x;
auto thread_idx = threadIdx.x;
int idx = blockIdx.x;
float arg0[4];

  float out[4];
  

  if (remaining < block_work_size) {
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++){
      if (thread_idx >= remaining) {
        break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      arg0[j] = load<float>(data[1], linear_idx);
      
      thread_idx += num_threads;
    }
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if ((threadIdx.x  + j*num_threads) < remaining) {
        out[j] = lgamma_kernel<float>(arg0[j] );
      }
    }
    thread_idx = threadIdx.x;
    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      if (thread_idx >= remaining) {
          break;
      }
      int linear_idx = thread_idx + block_work_size * idx;
      store<float>(out[j], data[0], linear_idx);
      thread_idx += num_threads;
    }
  } else {
    static constexpr int loop_size = thread_work_size / vec_size;

//actual loading
using vec_t_input = aligned_vector<float, vec_size>;
vec_t_input * vec0 = reinterpret_cast<vec_t_input *>(data[0+1]) + block_work_size / vec_size * idx;

    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_input v;
      v = vec0[thread_idx];
      #pragma unroll
      for (int j=0; j < vec_size; j++){
        arg0[vec_size * i + j] = v.val[j];
      }
      
      thread_idx += num_threads;
    }


    #pragma unroll
    for (int j = 0; j < thread_work_size; j++) {
      out[j] = lgamma_kernel<float>(arg0[j]);
    }
    using vec_t_output = aligned_vector<float, vec_size>;
    vec_t_output * to_ = reinterpret_cast<vec_t_output *>(data[0]) + block_work_size / vec_size * idx;
    int thread_idx = threadIdx.x;
    #pragma unroll
    for (int i = 0; i<loop_size; i++){
      vec_t_output v;
      #pragma unroll
      for (int j=0; j<vec_size; j++){
        v.val[j] = out[vec_size * i + j];
      }
      to_[thread_idx] = v;
      thread_idx += num_threads;
    }
  }

}

Epoch 1/500: 0%|

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions