TrioCFD 1.9.8
TrioCFD documentation
Loading...
Searching...
No Matches
Device.cpp
1/****************************************************************************
2* Copyright (c) 2026, CEA
3* All rights reserved.
4*
5* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9*
10* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
11* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13*
14*****************************************************************************/
15
16#include <TRUSTArray.h>
17#include <Device.h>
18#ifdef TRUST_USE_GPU
19#include <DeviceMemory.h>
20#endif
21#include <ctime>
22#include <string>
23#include <sstream>
24#include <map>
25#include <tuple>
26#include <chrono>
27
28#ifndef LATATOOLS
29#include <Perf_counters.h>
30#include <kokkos++.h>
31#include <comm_incl.h>
32#include <Comm_Group_MPI.h>
33#include <PE_Groups.h>
34#endif
35
36/*
37bool init_device_ = false;
38bool clock_on = false;
39bool fence = true;
40double clock_start;
41int timer_counter=0;
42#ifdef TRUST_USE_GPU
43bool timer = true;
44#else
45bool timer = false;
46#endif
47*/
48
49std::string ptrToString(const void* adr)
50{
51 std::stringstream ss;
52 ss << adr;
53 return ss.str();
54}
55
56// Voir AmgXWrapper (src/init.cpp)
57int AmgXWrapperScheduling(int rank, int nRanks, int nDevs)
58{
59 int devID;
60 if (nRanks <= nDevs) // Less process than devices
61 devID = rank;
62 else // More processes than devices
63 {
64 int nBasic = nRanks / nDevs,
65 nRemain = nRanks % nDevs;
66 if (rank < (nBasic+1)*nRemain)
67 devID = rank / (nBasic + 1);
68 else
69 devID = (rank - (nBasic+1)*nRemain) / nBasic + nRemain;
70 }
71 return devID;
72}
73
74
75#ifdef TRUST_USE_GPU
76void init_device()
77{
78 if (statistics().get_init_device()) return;
79 statistics().set_init_device(true);
80 if (getenv("TRUST_CLOCK_ON")!= nullptr) statistics().set_gpu_verbose(true);
81 if (getenv("TRUST_DISABLE_FENCE")!=nullptr) statistics().set_gpu_fence(false);
82 Process::imprimer_ram_totale(); // Impression avant copie des donnees sur GPU
83}
84#endif
85
86#ifndef LATATOOLS
87#ifdef TRUST_USE_CUDA
88#include <cuda_runtime.h>
89void init_cuda()
90{
91 // Necessaire sur JeanZay pour utiliser GPU Direct (http://www.idris.fr/jean-zay/gpu/jean-zay-gpu-mpi-cuda-aware-gpudirect.html)
92 // mais performances moins bonnes (trust PAR_gpu_3D 2) donc desactive en attendant d'autres tests:
93 // Absolument necessaire sur JeanZay (si OpenMPU-Cuda car sinon plantages lors des IO)
94 // Voir: https://www.open-mpi.org/faq/?category=runcuda#mpi-cuda-aware-support pour activer ou non a la compilation !
95#if defined(MPIX_CUDA_AWARE_SUPPORT) && MPIX_CUDA_AWARE_SUPPORT
96 char* local_rank_env;
97 cudaError_t cudaRet;
98 /* Recuperation du rang local du processus via la variable d'environnement
99 positionnee par Slurm, l'utilisation de MPI_Comm_rank n'etant pas encore
100 possible puisque cette routine est utilisee AVANT l'initialisation de MPI */
101 // ToDo pourrait etre appelee plus tard dans AmgX ou PETSc GPU...
102 local_rank_env = getenv("SLURM_LOCALID");
103 if (local_rank_env)
104 {
105 int rank = atoi(local_rank_env);
106 int nRanks = atoi(getenv("SLURM_NTASKS"));
107 if (rank==0) printf("The MPI library has CUDA-aware support and TRUST will try using this feature...\n");
108 /* Definition du GPU a utiliser pour chaque processus MPI */
109 int nDevs = 0;
110 cudaGetDeviceCount(&nDevs);
111 int devID = AmgXWrapperScheduling(rank, nRanks, nDevs);
112 cudaRet = cudaSetDevice(devID);
113 if(cudaRet != cudaSuccess)
114 {
115 printf("Error: cudaSetDevice failed\n");
116 abort();
117 }
118 else
119 {
120 if (rank==0) printf("init_cuda() done!");
121 cerr << "[MPI] Assigning rank " << rank << " to device " << devID << endl;
122 }
123 }
124 else
125 {
126 printf("Error : can't guess the local rank of the task\n");
127 abort();
128 }
129#endif /* MPIX_CUDA_AWARE_SUPPORT */
130}
131#endif /* TRUST_USE_CUDA */
132#endif /* LATATOOLS */
133
134// Address on device (return host adress if no device):
135template <typename _TYPE_>
136_TYPE_* addrOnDevice(_TYPE_* ptr)
137{
138#ifdef TRUST_USE_GPU
139 _TYPE_ *device_ptr = static_cast<_TYPE_*>(DeviceMemory::addrOnDevice(ptr));
140 return device_ptr;
141#else
142 return ptr;
143#endif
144}
145template <typename _TYPE_, typename _SIZE_>
146_TYPE_* addrOnDevice(TRUSTArray<_TYPE_,_SIZE_>& tab)
147{
148#ifdef TRUST_USE_GPU
149 if (tab.get_data_location()==DataLocation::HostOnly) return tab.data();
150 else return addrOnDevice(tab.data());
151#else
152 return tab.data();
153#endif
154}
155
156// Allocated ?
157template <typename _TYPE_>
158bool isAllocatedOnDevice(_TYPE_* tab_addr)
159{
160#ifdef TRUST_USE_GPU
161 return DeviceMemory::isAllocatedOnDevice(tab_addr);
162#else
163 return false;
164#endif
165}
166
167template <typename _TYPE_, typename _SIZE_>
168bool isAllocatedOnDevice(TRUSTArray<_TYPE_,_SIZE_>& tab)
169{
170#ifdef TRUST_USE_GPU
171 bool isAllocatedOnDevice1 = (tab.get_data_location() != DataLocation::HostOnly);
172 bool isAllocatedOnDevice2 = isAllocatedOnDevice(tab.data());
173 if (isAllocatedOnDevice1!=isAllocatedOnDevice2)
174 Process::exit("isAllocatedOnDevice(TRUSTArray<_TYPE_>& tab) error! Seems tab.get_data_location() is not up-to-date !");
175 return isAllocatedOnDevice2;
176#else
177 return false;
178#endif
179}
180
181// Allocate on device:
182template <typename _TYPE_, typename _SIZE_>
183_TYPE_* allocateOnDevice(TRUSTArray<_TYPE_,_SIZE_>& tab)
184{
185 _TYPE_ *tab_addr = tab.data();
186#ifdef TRUST_USE_GPU
187 if (isAllocatedOnDevice(tab)) deleteOnDevice(tab);
188 allocateOnDevice(tab_addr, tab.size_mem());
189 tab.set_data_location(DataLocation::Device);
190#endif
191 return tab_addr;
192}
193
194template <typename _TYPE_, typename _SIZE_>
195_TYPE_* allocateOnDevice(_TYPE_* ptr, _SIZE_ size)
196{
197#ifdef TRUST_USE_GPU
198 assert(!isAllocatedOnDevice(ptr)); // Verifie que la zone n'est pas deja allouee
199 statistics().begin_count(STD_COUNTERS::gpu_malloc_free,statistics().get_last_opened_counter_level()+1);
200 size_t bytes = sizeof(_TYPE_) * size;
201 size_t free_bytes = DeviceMemory::deviceMemGetInfo(0);
202 size_t total_bytes = DeviceMemory::deviceMemGetInfo(1);
203 if (bytes>free_bytes)
204 {
205 Cerr << "Error ! Trying to allocate " << bytes << " bytes on GPU memory whereas only " << free_bytes << " bytes are available." << finl;
207 }
208 _TYPE_* device_ptr = static_cast<_TYPE_*>(Kokkos::kokkos_malloc(bytes));
209 // Map host_ptr with device_ptr:
210 DeviceMemory::add(ptr, device_ptr, size * sizeof(_TYPE_));
211 if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre())
212 {
213 std::string clock(Process::is_parallel() ? "[clock]#"+std::to_string(Process::me()) : "[clock] ");
214 double ms = 1000 * statistics().get_time_since_last_open(STD_COUNTERS::gpu_malloc_free);
215 printf("%s %7.3f ms [Data] Allocate on device [%9s] %6ld Bytes (%ld/%ldGB free) Currently allocated: %6ld\n", clock.c_str(), ms, ptrToString(ptr).c_str(), long(bytes), free_bytes/(1024*1024*1024), total_bytes/(1024*1024*1024), long(DeviceMemory::allocatedBytesOnDevice()));
216 }
217 statistics().end_count(STD_COUNTERS::gpu_malloc_free);
218#ifndef NDEBUG
219 const _TYPE_ INVALIDE_ = (std::is_same<_TYPE_,double>::value) ? DMAXFLOAT*0.999 : ( (std::is_same<_TYPE_,int>::value) ? INT_MIN : 0); // Identique a TRUSTArray<_TYPE_>::fill_default_value()
220 Kokkos::View<_TYPE_*> ptr_v(device_ptr, size);
221 Kokkos::parallel_for(start_gpu_timer(__KERNEL_NAME__), size, KOKKOS_LAMBDA(const int i)
222 {
223 ptr_v(i) = INVALIDE_;
224 });
225 end_gpu_timer(__KERNEL_NAME__);
226#endif
227#endif
228 return ptr;
229}
230
231// Delete on device:
232template <typename _TYPE_, typename _SIZE_>
233void deleteOnDevice(TRUSTArray<_TYPE_,_SIZE_>& tab)
234{
235#ifdef TRUST_USE_GPU
236 _TYPE_ *tab_addr = tab.data();
237 if (statistics().get_init_device() && tab_addr && isAllocatedOnDevice(tab))
238 {
239 deleteOnDevice(tab_addr, tab.size_mem());
240 tab.set_data_location(DataLocation::HostOnly);
241 }
242#endif
243}
244
245template <typename _TYPE_, typename _SIZE_>
246void deleteOnDevice(_TYPE_* ptr, _SIZE_ size)
247{
248#ifdef TRUST_USE_GPU
249 statistics().begin_count(STD_COUNTERS::gpu_malloc_free,statistics().get_last_opened_counter_level()+1);
250 std::string clock;
251 if (PE_Groups::get_nb_groups()>0 && Process::is_parallel()) clock = "[clock]#"+std::to_string(Process::me());
252 else
253 clock = "[clock] ";
254 _SIZE_ bytes = sizeof(_TYPE_) * size;
255 if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre())
256 cout << clock << " [Data] Delete on device array [" << ptrToString(ptr).c_str() << "] of " << bytes << " Bytes. It remains " << DeviceMemory::getMemoryMap().size()-1 << " arrays." << endl << flush;
257 Kokkos::kokkos_free(addrOnDevice(ptr));
259 statistics().end_count(STD_COUNTERS::gpu_malloc_free);
260#endif
261}
262
263// Update const array on device if necessary
264// Before After Copy ?
265// HostOnly HostDevice Yes
266// Host HostDevice Yes
267// HostDevice HostDevice No
268// Device Device No
269template <typename _TYPE_, typename _SIZE_>
270const _TYPE_* mapToDevice(const TRUSTArray<_TYPE_,_SIZE_>& tab)
271{
272 // Update data on device if necessary
273 DataLocation loc = tab.isDataOnDevice() ? tab.get_data_location() : DataLocation::HostDevice;
274 const _TYPE_ *tab_addr = mapToDevice_(const_cast<TRUSTArray<_TYPE_,_SIZE_> &>(tab), loc);
275 return tab_addr;
276}
277
278template <typename _TYPE_, typename _SIZE_>
279_TYPE_* mapToDevice_(TRUSTArray<_TYPE_,_SIZE_>& tab, DataLocation nextLocation)
280{
281 _TYPE_ *tab_addr = tab.data();
282#ifdef TRUST_USE_GPU
283 DataLocation currentLocation = tab.get_data_location();
284 tab.set_data_location(nextLocation); // Important de specifier le nouveau status avant la recuperation du pointeur:
285 // Important for ref_array/ref_tab support, we take the size of the memory allocated, not the size of the array (tab.size_array()):
286 //int memory_size = tab.size_array();
287 int memory_size = tab.size_mem();
288 if (currentLocation==DataLocation::HostOnly)
289 {
290 // Not a Trav which is already allocated on device:
291 if (!(tab.get_mem_storage() == STORAGE::TEMP_STORAGE && isAllocatedOnDevice(tab_addr)))
292 allocateOnDevice(tab_addr, memory_size);
293 copyToDevice(tab_addr, memory_size);
294 }
295 else if (currentLocation==DataLocation::Host)
296 {
297 copyToDevice(tab_addr, memory_size);
298 if (DeviceMemory::warning(memory_size)) // Warning for large array only:
299 ToDo_Kokkos("H2D update of large array! Add a breakpoint to find the reason.");
300 }
301 else if (currentLocation==DataLocation::PartialHostDevice)
302 Process::exit("Error, can't map on device an array with PartialHostDevice status!");
303#endif
304 return tab_addr;
305}
306
307template <typename _TYPE_, typename _SIZE_>
308void copyToDevice(_TYPE_* ptr, _SIZE_ size)
309{
310#ifdef TRUST_USE_GPU
311 if (size>0)
312 {
313 assert(isAllocatedOnDevice(ptr));
314 _SIZE_ bytes = sizeof(_TYPE_) * size;
315 start_gpu_timer("copyToDevice",bytes);
316 statistics().begin_count(STD_COUNTERS::gpu_copytodevice,statistics().get_last_opened_counter_level()+1);
317 Kokkos::View<_TYPE_*> host_view(ptr, size);
318 Kokkos::View<_TYPE_*> device_view(addrOnDevice(ptr), size);
319 Kokkos::deep_copy(device_view, host_view);
320 statistics().end_count(STD_COUNTERS::gpu_copytodevice,1,static_cast<int>(bytes));
321 std::stringstream message;
322 message << "Copy to device [" << ptrToString(ptr) << "] " << size << " items ";
323 end_gpu_timer(message.str(), 0, bytes);
324 }
325#endif
326}
327
328// Copy non-const array on device if necessary for computation on device
329template <typename _TYPE_, typename _SIZE_>
330_TYPE_* computeOnTheDevice(TRUSTArray<_TYPE_,_SIZE_>& tab)
331{
332 // non-const array will be modified on device:
333 _TYPE_ *tab_addr = mapToDevice_(tab, DataLocation::Device);
334 return tab_addr;
335}
336
337// ToDo OpenMP: rename copy -> update or map ?
338// Copy non-const array to host from device
339template <typename _TYPE_, typename _SIZE_>
340void copyFromDevice(TRUSTArray<_TYPE_,_SIZE_>& tab)
341{
342#ifdef TRUST_USE_GPU
343 if (tab.get_data_location() == DataLocation::Device)
344 {
345 copyFromDevice(tab.data(), tab.size_mem());
346 tab.set_data_location(DataLocation::HostDevice);
347 }
348#endif
349}
350template <typename _TYPE_, typename _SIZE_>
351void copyFromDevice(_TYPE_* ptr, _SIZE_ size)
352{
353#ifdef TRUST_USE_GPU
354 if (size>0)
355 {
356 assert(isAllocatedOnDevice(ptr));
357 _SIZE_ bytes = sizeof(_TYPE_) * size;
358 start_gpu_timer("copyFromDevice",bytes);
359 statistics().begin_count(STD_COUNTERS::gpu_copyfromdevice,statistics().get_last_opened_counter_level()+1);
360 Kokkos::View<_TYPE_*> host_view(ptr, size);
361 Kokkos::View<_TYPE_*> device_view(addrOnDevice(ptr), size);
362 Kokkos::deep_copy(host_view, device_view);
363 statistics().end_count(STD_COUNTERS::gpu_copyfromdevice,1,static_cast<int>(bytes));
364 std::stringstream message;
365 message << "Copy from device [" << ptrToString(ptr) << "] " << size << " items ";
366 end_gpu_timer(message.str(), 0, bytes);
367 //if (statistics().is_gpu_verbose_on()) printf("\n");
368 if (DeviceMemory::warning(size)) // Warning for large array only:
369 ToDo_Kokkos("D2H update of large array! Add a breakpoint to find the reason if not IO.");
370 }
371#endif
372}
373
374// Copy const array to host from device
375template <typename _TYPE_, typename _SIZE_>
376void copyFromDevice(const TRUSTArray<_TYPE_,_SIZE_>& tab)
377{
378 copyFromDevice(const_cast<TRUSTArray<_TYPE_,_SIZE_>&>(tab));
379}
380
381
382//
383// Explicit template instanciations
384//
385template char* addrOnDevice<char>(char* ptr);
386template double* addrOnDevice<double>(TRUSTArray<double>& tab);
387template int* addrOnDevice<int>(TRUSTArray<int>& tab);
388template float* addrOnDevice<float>(TRUSTArray<float>& tab);
389
390template bool isAllocatedOnDevice<double>(double* tab_addr);
391template bool isAllocatedOnDevice<int>(int* tab_addr);
392template bool isAllocatedOnDevice<float>(float* tab_addr);
393template bool isAllocatedOnDevice<double>(TRUSTArray<double>& tab);
394template bool isAllocatedOnDevice<int>(TRUSTArray<int>& tab);
395template bool isAllocatedOnDevice<float>(TRUSTArray<float>& tab);
396
397template double* allocateOnDevice<double>(TRUSTArray<double>& tab);
398template int* allocateOnDevice<int>(TRUSTArray<int>& tab);
399template float* allocateOnDevice<float>(TRUSTArray<float>& tab);
400template char* allocateOnDevice<char>(char* ptr, int size);
401
402template const double* allocateOnDevice<double>(const TRUSTArray<double>& tab);
403template const int* allocateOnDevice<int>(const TRUSTArray<int>& tab);
404template const float* allocateOnDevice<float>(const TRUSTArray<float>& tab);
405
406template void deleteOnDevice<double>(TRUSTArray<double>& tab);
407template void deleteOnDevice<int>(TRUSTArray<int>& tab);
408template void deleteOnDevice<float>(TRUSTArray<float>& tab);
409template void deleteOnDevice<char>(char* ptr, int size);
410template void deleteOnDevice<int>(int* ptr, int size);
411template void deleteOnDevice<float>(float* ptr, int size);
412template void deleteOnDevice<double>(double* ptr, int size);
413
414template const double* mapToDevice<double>(const TRUSTArray<double>& tab);
415template const int* mapToDevice<int>(const TRUSTArray<int>& tab);
416template const float* mapToDevice<float>(const TRUSTArray<float>& tab);
417template void copyToDevice<char>(char* ptr, int size);
418
419template double* mapToDevice_<double>(TRUSTArray<double>& tab, DataLocation nextLocation);
420template int* mapToDevice_<int>(TRUSTArray<int>& tab, DataLocation nextLocation);
421template float* mapToDevice_<float>(TRUSTArray<float>& tab, DataLocation nextLocation);
422
423template double* computeOnTheDevice<double>(TRUSTArray<double>& tab);
424template int* computeOnTheDevice<int>(TRUSTArray<int>& tab);
425template float* computeOnTheDevice<float>(TRUSTArray<float>& tab);
426
427template void copyFromDevice<double>(TRUSTArray<double>& tab);
428template void copyFromDevice<int>(TRUSTArray<int>& tab);
429template void copyFromDevice<float>(TRUSTArray<float>& tab);
430template void copyFromDevice<char>(char* ptr, int size);
431
432template void copyFromDevice<double>(const TRUSTArray<double>& tab);
433template void copyFromDevice<int>(const TRUSTArray<int>& tab);
434template void copyFromDevice<float>(const TRUSTArray<float>& tab);
435
436#if INT_is_64_ == 2
437
438// The ones needed for 64 bits:
439template double* addrOnDevice<double>(TRUSTArray<double,trustIdType>& tab);
440template int* addrOnDevice<int>(TRUSTArray<int,trustIdType>& tab);
441template trustIdType* addrOnDevice<trustIdType>(TRUSTArray<trustIdType,trustIdType>& tab);
442template trustIdType* addrOnDevice<trustIdType>(TRUSTArray<trustIdType,int>& tab);
443template float* addrOnDevice<float>(TRUSTArray<float,trustIdType>& tab);
444
445template bool isAllocatedOnDevice<trustIdType>(trustIdType* tab_addr);
446template bool isAllocatedOnDevice<double>(TRUSTArray<double,trustIdType>& tab);
447template bool isAllocatedOnDevice<float>(TRUSTArray<float,trustIdType>& tab);
448template bool isAllocatedOnDevice<int>(TRUSTArray<int,trustIdType>& tab);
449template bool isAllocatedOnDevice<trustIdType>(TRUSTArray<trustIdType,trustIdType>& tab);
450template bool isAllocatedOnDevice<trustIdType>(TRUSTArray<trustIdType,int>& tab);
451
452template double* allocateOnDevice<double>(TRUSTArray<double,trustIdType>& tab);
453template int* allocateOnDevice<int>(TRUSTArray<int,trustIdType>& tab);
454template trustIdType* allocateOnDevice<trustIdType>(TRUSTArray<trustIdType,trustIdType>& tab);
455template trustIdType* allocateOnDevice<trustIdType>(TRUSTArray<trustIdType,int>& tab);
456template float* allocateOnDevice<float>(TRUSTArray<float,trustIdType>& tab);
457
458template const double* allocateOnDevice<double>(const TRUSTArray<double,trustIdType>& tab);
459template const int* allocateOnDevice<int>(const TRUSTArray<int,trustIdType>& tab);
460template const trustIdType* allocateOnDevice<trustIdType>(const TRUSTArray<trustIdType,trustIdType>& tab);
461template const trustIdType* allocateOnDevice<trustIdType>(const TRUSTArray<trustIdType,int>& tab);
462template const float* allocateOnDevice<float>(const TRUSTArray<float,trustIdType>& tab);
463
464template void deleteOnDevice<double>(TRUSTArray<double,trustIdType>& tab);
465template void deleteOnDevice<int>(TRUSTArray<int,trustIdType>& tab);
466template void deleteOnDevice<trustIdType>(TRUSTArray<trustIdType,trustIdType>& tab);
467template void deleteOnDevice<trustIdType>(TRUSTArray<trustIdType,int>& tab);
468template void deleteOnDevice<float>(TRUSTArray<float,trustIdType>& tab);
469
470template void deleteOnDevice<int>(int* ptr, long size);
471template void deleteOnDevice<int>(int* ptr, long long size);
472template void deleteOnDevice<trustIdType>(trustIdType* ptr, long size);
473template void deleteOnDevice<trustIdType>(trustIdType* ptr, long long size);
474template void deleteOnDevice<trustIdType>(trustIdType* ptr, int size);
475template void deleteOnDevice<float>(float* ptr, long size);
476template void deleteOnDevice<double>(double* ptr, long size);
477template void deleteOnDevice<float>(float* ptr, long long size);
478template void deleteOnDevice<double>(double* ptr, long long size);
479
480template const double* mapToDevice<double>(const TRUSTArray<double,trustIdType>& tab);
481template const int* mapToDevice<int>(const TRUSTArray<int,trustIdType>& tab);
482template const trustIdType* mapToDevice<trustIdType>(const TRUSTArray<trustIdType,trustIdType>& tab);
483template const trustIdType* mapToDevice<trustIdType>(const TRUSTArray<trustIdType,int>& tab);
484template const float* mapToDevice<float>(const TRUSTArray<float,trustIdType>& tab);
485
486template int* computeOnTheDevice(TRUSTArray<int,trustIdType>& tab);
487template trustIdType* computeOnTheDevice(TRUSTArray<trustIdType,trustIdType>& tab);
488template trustIdType* computeOnTheDevice(TRUSTArray<trustIdType,int>& tab);
489template float* computeOnTheDevice(TRUSTArray<float,trustIdType>& tab);
490template double* computeOnTheDevice(TRUSTArray<double,trustIdType>& tab);
491
492template void copyFromDevice<int, trustIdType>(TRUSTArray<int,trustIdType>& tab);
493template void copyFromDevice<trustIdType, int>(TRUSTArray<trustIdType,int>& tab);
494template void copyFromDevice<trustIdType, trustIdType>(TRUSTArray<trustIdType,trustIdType>& tab);
495template void copyFromDevice<float, trustIdType>(TRUSTArray<float,trustIdType>& tab);
496template void copyFromDevice<double, trustIdType>(TRUSTArray<double,trustIdType>& tab);
497
498// With const:
499template void copyFromDevice<int, trustIdType>(const TRUSTArray<int,trustIdType>& tab);
500template void copyFromDevice<trustIdType, int>(const TRUSTArray<trustIdType,int>& tab);
501template void copyFromDevice<trustIdType, trustIdType>(const TRUSTArray<trustIdType,trustIdType>& tab);
502template void copyFromDevice<float, trustIdType>(const TRUSTArray<float,trustIdType>& tab);
503template void copyFromDevice<double, trustIdType>(const TRUSTArray<double,trustIdType>& tab);
504
505
506// Timers GPU (avec possibilite de desactiver avec if (timer) dans certains Kernels critiques sur CPU):
507std::string start_gpu_timer(std::string str, int bytes)
508{
509#ifdef TRUST_USE_GPU
510 if (!statistics().get_init_device())
511 return str;
512 if (statistics().get_gpu_timer())
513 Process::exit("A GPU KERNEL is still running, you can't open a new one yet");
514 statistics().start_gpu_timer();
515 statistics().add_to_gpu_timer_counter(1);
516#ifndef NDEBUG
517 if (statistics().get_gpu_timer_counter()>1)
518 Cerr << "[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() << " : start_gpu_timer() not closed by end_gpu_timer() !" << finl;
519 //Process::exit("Error, start_gpu_timer() not closed by end_gpu_timer() !");
520#endif
521 if (bytes == -1)
522 statistics().begin_count(STD_COUNTERS::gpu_kernel,statistics().get_last_opened_counter_level()+1);
523#ifdef TRUST_USE_CUDA
524 if (!str.empty()) nvtxRangePush(str.c_str());
525#endif
526#ifdef TRUST_USE_ROCM
527 if (!str.empty()) roctxRangePush(str.c_str());
528#endif
529#endif
530 return str;
531}
532
533void end_gpu_timer(const std::string& str, int onDevice, int bytes) // Return in [ms]
534{
535#ifdef TRUST_USE_GPU
536 if (!statistics().get_init_device())
537 return;
538 statistics().add_to_gpu_timer_counter(-1);
539#ifndef NDEBUG
540 if (statistics().get_gpu_timer_counter()!=0)
541 Cerr << "[Kokkos] timer_counter=" << statistics().get_gpu_timer_counter() << " : end_gpu_timer() not opened by start_gpu_timer() !" << finl;
542 //Process::exit("Error, start_gpu_timer() not closed by end_gpu_timer() !");
543#endif
544 if (onDevice)
545 {
546#ifdef TRUST_USE_UVM
547 cudaDeviceSynchronize();
548#endif
549#ifdef KOKKOS
550 if (statistics().get_gpu_fence()) Kokkos::fence(); // Barrier for real time
551#endif
552 }
553 if (bytes == -1)
554 statistics().end_count(STD_COUNTERS::gpu_kernel,onDevice);
555 if (statistics().is_gpu_verbose_on() && Process::je_suis_maitre()) // Affichage
556 {
557 std::string clock(Process::is_parallel() ? "[clock]#" + std::to_string(Process::me()) : "[clock] ");
558 double ms = 1000 * statistics().stop_gpu_timer_and_compute_gpu_time();
559 if (bytes == -1)
560 {
561 if (!str.empty())
562 printf("%s %7.3f ms [%s %15s\n", clock.c_str(), ms, onDevice ? "Device]" : "Host] ", str.c_str());
563 }
564 else
565 {
566 double mo = (double) bytes / 1024 / 1024;
567 if (ms == 0 || bytes == 0)
568 printf("%s [Data] %15s\n", clock.c_str(), str.c_str());
569 else
570 printf("%s %7.3f ms [Data] %15s %6ld Bytes %5.1f Go/s\n", clock.c_str(), ms, str.c_str(),
571 long(bytes), mo / ms);
572 //printf("%s %7.3f ms [Data] %15s %6ld Mo %5.1f Go/s\n", clock.c_str(), ms, str.c_str(), long(mo), mo/ms);
573 }
574 fflush(stdout);
575 }
576 else
577 statistics().stop_gpu_timer();
578
579#ifdef TRUST_USE_CUDA
580 if (!str.empty()) nvtxRangePop();
581#endif
582#ifdef TRUST_USE_ROCM
583 if (!str.empty()) roctxRangePop();
584#endif
585#endif
586}
587
588#endif
589
static const int & get_nb_groups()
static bool is_parallel()
Definition Process.cpp:110
static void imprimer_ram_totale(int all_process=0)
Definition Process.cpp:651
static int me()
renvoie mon rang dans le groupe de communication courant.
Definition Process.cpp:125
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
Definition Process.cpp:455
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
Definition Process.cpp:86
Represents a an array of int/int64/double/... values.
Definition TRUSTArray.h:81
int size_mem()
Definition TRUSTArray.h:249
DataLocation get_data_location()
Definition TRUSTArray.h:245
STORAGE get_mem_storage() const
Definition TRUSTArray.h:171
_TYPE_ * data()
void set_data_location(DataLocation flag)
Definition TRUSTArray.h:247
bool isDataOnDevice() const
static bool isAllocatedOnDevice(void *)
static size_t allocatedBytesOnDevice()
static void add(void *ptr, void *device_ptr, trustIdType bytes)
static map_t & getMemoryMap()
static bool warning(trustIdType nb_items)
static void del(void *ptr)
static size_t deviceMemGetInfo(bool)
static void * addrOnDevice(void *)