Is it possible to read a CUDA texture using a floating-point index directly, e.g. can I perform a texture fetch using tex.1d.v4.f32.f32?
This appears to save 2 instructions when looking at the .ptx files, and this is reflected in increased performance when benchmarking. However, the rather critical downside is that, while it appears to run without issue, it does not produce the desired results.
The code below demonstrates the issue:
#include "cuda.h"
#include <cstdio>
#include <cstdlib>
#include <thrust/device_vector.h>

// Abort with file/line context if a CUDA runtime call fails, so that an
// earlier error is not silently carried into later calls.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t err_ = (call);                                         \
        if (err_ != cudaSuccess) {                                         \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(err_));                             \
            std::exit(EXIT_FAILURE);                                       \
        }                                                                  \
    } while (0)

// Global 1D texture reference of type float, using the element-type read mode.
texture<float, cudaTextureType1D, cudaReadModeElementType> tex;

// Hand-rolled PTX texture lookup using tex.1d.v4.f32.f32, i.e. a fetch that
// takes a floating-point coordinate. Returns the first component of the
// fetched 4-vector.
// NOTE: this is the variant under investigation; it is intentionally left
// as-is so the program keeps demonstrating the mismatch with tex1Dfetch.
__device__ float tex_load(float idx)
{
    float4 temp;
    asm("tex.1d.v4.f32.f32 {%0, %1, %2, %3}, [tex, {%4}];"
        : "=f"(temp.x), "=f"(temp.y), "=f"(temp.z), "=f"(temp.w)
        : "f"(idx));
    return temp.x;
}

// Reads element 0 of the texture twice -- once through tex1Dfetch and once
// through the custom tex_load -- and prints both values for comparison.
// Launched below with a single thread in a single block.
__global__ void read()
{
    float x = tex1Dfetch(tex, 0);  // tex1Dfetch takes an integer index
    float y = tex_load(0.0f);
    printf("tex1dfetch: %f tex_load: %f\n", x, y);
}

int main()
{
    // Create a device vector of size 1 with x[0] = 3.14.
    thrust::device_vector<float> x(1, 3.14f);
    float* x_ptr = thrust::raw_pointer_cast(&x[0]);

    // Bind the texture reference to the vector's linear device memory.
    CUDA_CHECK(cudaBindTexture(0, tex, x_ptr, sizeof(float)));

    // Launch a single-thread, single-block kernel.
    read<<<1, 1>>>();
    CUDA_CHECK(cudaGetLastError());       // launch-configuration errors
    CUDA_CHECK(cudaDeviceSynchronize());  // execution errors; flushes device printf

    CUDA_CHECK(cudaUnbindTexture(tex));
    return 0;
}
I've tried this on a couple of cards (K40, C2070) and a couple of CUDA versions (6.0, 7.0), and on all of them I get the same output:
tex1dfetch: 3.140000 tex_load: 0.000000
Is this possible, or am I barking up the wrong tree?
Your problem is that you are using an unsupported instruction for a texture bound to linear memory with the default cudaReadModeElementType read mode. If you rewrite your function like this:
// PTX texture lookup using tex.1d.v4.f32.s32: the coordinate handed to the
// texture instruction is a signed 32-bit integer rather than a float.
// Reads from the texture named `tex` and returns the first component of the
// fetched 4-vector.
__device__ float tex_load(int idx)
{
    float4 texel;
    asm("tex.1d.v4.f32.s32 {%0, %1, %2, %3}, [tex, {%4}];"
        : "=f"(texel.x), "=f"(texel.y), "=f"(texel.z), "=f"(texel.w)
        : "r"(idx));
    return texel.x;
}
i.e. pass an integer index to the texture unit, not a float, I think you will find it works correctly. You would need to have a texture with a filtering read mode to use tex.1d.v4.f32.f32.
Comments
Post a Comment