GPU/TPC: Increase assumed cacheline size to 128 bytes in cluster finder

fweig · fweig · commit bb9f0cc0ba65 · 2026-06-22T12:41:23.000+02:00
diff --git a/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h b/GPU/GPUTracking/TPCClusterFinder/CfArray2D.h
@@ -86,10 +86,13 @@ class LinearLayout
 template <tpccf::SizeT S>
 struct GridSize;
 
+// GridSize for 1-byte and 2-byte elements is adjusted for 128-byte cachelines,
+// as these are prevalent on modern GPUs.
+
 template <>
 struct GridSize<1> {
   enum {
-    Width = 8,
+    Width = 16,
     Height = 8,
   };
 };
@@ -98,10 +101,13 @@ template <>
 struct GridSize<2> {
   enum {
     Width = 8,
-    Height = 4,
+    Height = 8,
   };
 };
 
+// GridSize for 4 bytes is only used for MC indexing on CPU.
+// So assume 64 byte cachelines here instead.
+
 template <>
 struct GridSize<4> {
   enum {
diff --git a/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h b/GPU/GPUTracking/TPCClusterFinder/GPUTPCCFCheckPadBaseline.h
@@ -53,7 +53,6 @@ class GPUTPCCFCheckPadBaseline : public GPUKernelTemplate
     TimebinsPerCacheline = TPCMapMemoryLayout<uint16_t>::Height,
     EntriesPerCacheline = PadsPerCacheline * TimebinsPerCacheline,
     NumOfCachedPads = GPUCA_WARP_SIZE / TimebinsPerCacheline,
-    NumCLsPerWarp = GPUCA_WARP_SIZE / EntriesPerCacheline,
     NumOfCachedTBs = TimebinsPerCacheline * 8,
     // Threads index shared memory as [iThread / MaxNPadsPerRow][iThread % MaxNPadsPerRow].
     // Rounding up to a multiple of PadsPerCacheline ensures iThread / MaxNPadsPerRow < NumOfCachedTBs