TrioCFD 1.9.8
TrioCFD documentation
Loading...
Searching...
No Matches
Perf_counters.cpp
1/****************************************************************************
2* Copyright (c) 2026, CEA
3* All rights reserved.
4*
5* Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
6* 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
7* 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
8* 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission.
9*
10* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
11* IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
12* OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
13*
14*****************************************************************************/
15
16#include <Perf_counters.h>
17#include <stdio.h>
18#include <algorithm>
19#include <string.h>
20#include <vector>
21#include <array>
22#include <map>
23#include <tuple>
24#include <chrono>
25#include <assert.h>
26#include <sys/utsname.h>
27#include <math.h>
28#include <cmath>
29#include <iostream>
30#include <sstream>
31#include <fstream>
32#include <EntreeSortie.h>
33#include <iomanip>
34#include <EcrFicPartage.h>
35#include <memory>
36#include <iomanip>
37#include <TRUST_Version.h>
38#include <thread>
39
// Backend-neutral aliases: the gpu* names and GPU_SUCCESS below map to the CUDA runtime API so that
// the GPU query code in this file can be written once for both backends.
40#ifdef TRUST_USE_CUDA
41// NVTX range markers, see https://nvidia.github.io/NVTX/
42// See also https://stackoverflow.com/questions/23230003/something-between-func-and-pretty-function/29856690#29856690
43#include <nvtx3/nvToolsExt.h>
44#include <cuda_runtime.h>
45#define gpuDeviceProp_t cudaDeviceProp
46#define gpuGetDevice cudaGetDevice
47#define gpuGetDeviceProperties cudaGetDeviceProperties
48#define gpuDriverGetVersion cudaDriverGetVersion
49#define gpuRuntimeGetVersion cudaRuntimeGetVersion
// VERSION_DIVISOR / VERSION_MOD are used to split the integer version number into the two printed fields
50#define VERSION_DIVISOR 1000
51#define VERSION_MOD 100
52#define GPU_SUCCESS cudaSuccess
53#endif
// Same aliases for the ROCm/HIP backend (roctx range markers instead of NVTX)
54#ifdef TRUST_USE_ROCM
55#include <rocprofiler-sdk-roctx/roctx.h>
56#include <hip/hip_runtime.h>
57#define gpuDeviceProp_t hipDeviceProp_t
58#define gpuGetDevice hipGetDevice
59#define gpuGetDeviceProperties hipGetDeviceProperties
60#define gpuDriverGetVersion hipDriverGetVersion
61#define gpuRuntimeGetVersion hipRuntimeGetVersion
62#define VERSION_DIVISOR 10000000
63#define VERSION_MOD 100000
64#define GPU_SUCCESS hipSuccess
65#endif
66#define MINFLOAT 1.e-34 // a very small positive value
67
68
69// Structs used for storing CPU and GPU info
/// Description of the host CPU as reported in the performance log header.
struct CPUInfo
{
  std::string model;         ///< CPU model name (empty until filled in)
  long int num_threads = 0;  ///< Number of hardware threads; 0 until filled in, so the field is never read uninitialized
};
75
/// Description of the GPU in use; the defaults below are kept when no GPU information is filled in.
struct GPUInfo
{
  std::string name{"None"};               ///< Device name
  std::string runtime_version{"-10000"};  ///< Runtime version as text
  std::string driver_version{"-10000"};   ///< Driver version as text
};
82
83
84/**************************************************************************************************************************
85 *
86 * Introduction of the class counter that described the behavior of a single counter in TRUST
87 *
88 **************************************************************************************************************************/
89
// A single performance counter: it accumulates elapsed time, a call count and a custom quantity
// between begin_count_() and end_count_(), and keeps a link to the counter that was open before it.
90struct Counter
91{
92
93 using clock = std::chrono::high_resolution_clock;
94 using time_point = std::chrono::time_point<clock>;
95 using duration = std::chrono::duration<double>;
 // Current time read from the clock used by all counters
96 inline time_point now()
97 {
98 return clock::now();
99 }
100
 // Builds a stopped counter; family defaults to "None", and the comm/gpu flags default to false
101 Counter(int counter_level, std::string counter_name, std::string counter_family = "None", bool is_comm = false, bool is_gpu = false);
102
 // Marks the counter as running from time t, with the given level
103 void begin_count_(int counter_level, time_point t);
104
 // Stops the counter at t_stop and adds the increments to the accumulated count and quantity
105 void end_count_(int count_increment, long int quantity_increment, time_point t_stop);
106
107 inline void set_parent(Counter * parent_counter) { parent_ = parent_counter;}
108
 // Accumulated total time, in seconds (duration is a std::chrono::duration<double>)
109 inline double get_time_() const {return total_time_.count();}
110
111 inline bool running_() const { return is_running_; }
112
113 /*! @brief Reduce the statistics of this counter over all the processes.
114 *
 * Returns four {min, max, average, standard deviation} arrays, in this order:
 * total time, quantity, count, time alone.
115 */
116
117 std::array< std::array<double,4> ,4> compute_min_max_avg_sd_() const;
118
119 void reset();
120
121 const std::string description_;
123 const std::string family_ ;
124 const bool is_comm_;
125 const bool is_gpu_;
127 long int quantity_;
130 duration time_alone_; // time during which the counter is open minus the time during which a counter of lower level was open
131 duration time_ts_; // total time tracked during the current time step
140}
141;
142
// Builds a stopped counter: all accumulated durations, the count and the quantity start at zero,
// there is no parent, and the minimum time per step starts at the largest double so that any measured value replaces it.
143Counter::Counter(int counter_level, std::string counter_name, std::string counter_family , bool is_comm, bool is_gpu)
144 :description_(counter_name), level_(counter_level), family_(counter_family), is_comm_(is_comm), is_gpu_(is_gpu), count_(0),
145 quantity_( 0), parent_( nullptr),
146 total_time_(duration::zero()),
147 time_alone_(duration::zero()),
148 time_ts_(duration::zero()),
152 avg_time_per_step_( 0.0 ),
153 min_time_per_step_( std::numeric_limits<double>::max()),
154 max_time_per_step_( 0.0 ),
155 var_time_per_step_( 0.0 ),
156 is_running_( false)
157{
158}
159
160void Counter::begin_count_(int counter_level, time_point t)
161{
162 if (counter_level != level_)
163 {
164 level_ = counter_level; ///< You have changed the level of your counter
165 }
166 is_running_ = true;
167 last_open_time_ = t;
169 if (parent_!= nullptr)
170 {
171 parent_->time_alone_ +=duration (t - last_open_time_alone_);
172 parent_->last_open_time_alone_ = time_point();
173 }
174#ifdef TRUST_USE_CUDA
175 if (!is_comm_)
176 nvtxRangePush(description_.c_str());
177#endif
178#ifdef TRUST_USE_ROCM
179 if (!is_comm_)
180 roctxRangePush(description_.c_str());
181#endif
182}
183
// Stop the counter at t_stop: the time elapsed since the opening stamps is added to total_time_ and
// time_alone_, and the count and quantity are incremented by the given amounts.
// Exits through Process::exit if the counter is not running.
184void Counter::end_count_(int count_increment, long int quantity_increment, time_point t_stop)
185{
186 if (!is_running_)
187 Process::exit("Last open_time was not properly set"+ description_);
188 duration t_tot = t_stop-last_open_time_;
189 duration t_alone = t_stop - last_open_time_alone_;
190 quantity_ += quantity_increment;
191 total_time_ += t_tot;
192 time_alone_ += t_alone;
193 count_ += count_increment;
194 if (parent_!= nullptr)
195 {
 // The link to the parent is dropped once the counter is closed
197 parent_ = nullptr;
198 }
199 is_running_ = false;
 // Close the profiler range pushed in begin_count_ (communication counters never push one)
203#ifdef TRUST_USE_CUDA
204 if (!is_comm_) nvtxRangePop();
205#endif
206#ifdef TRUST_USE_ROCM
207 if (!is_comm_) roctxRangePop();
208#endif
209}
210
211std::array< std::array<double,4> ,4> Counter::compute_min_max_avg_sd_() const
212{
213 assert(Process::is_parallel());
214 double qty,cnt,min,max,avg,sd ;
215 qty=static_cast<double>(quantity_);
216 cnt = static_cast<double>(count_);
217
218 auto l_compute =[&min, &max,&avg,&sd] (double value)
219 {
220 min = Process::mp_min(value);
221 max = Process::mp_max(value);
222 avg = Process::mp_sum(value)/Process::nproc();
223 sd = sqrt(std::max(0., Process::mp_sum((value-avg)*(value-avg))/Process::nproc()));
224 std::array<double,4> result = {min,max,avg,sd};
225 return result;
226 };
227
228 std::array<double,4> min_max_avg_sd_time = l_compute(total_time_.count());
229
230 std::array<double,4> min_max_avg_sd_quantity = l_compute(qty);
231
232 std::array<double,4> min_max_avg_sd_count_ = l_compute(cnt);
233
234 std::array<double,4> min_max_avg_sd_time_alone_ = l_compute(time_alone_.count());
235
236 return {min_max_avg_sd_time,min_max_avg_sd_quantity,min_max_avg_sd_count_,min_max_avg_sd_time_alone_ };
237}
238
240{
 // Put the accumulated statistics back to zero and restart the per-time-step stamp from now
241 count_ = 0;
242 quantity_ = 0;
243 avg_time_per_step_ = 0.0 ;
 // NOTE(review): the constructor initializes min_time_per_step_ to the largest double, here it is set to 0.0 - confirm this is intended
244 min_time_per_step_ = 0.0 ;
245 max_time_per_step_ = 0.0 ;
246 var_time_per_step_ = 0.0 ;
247 open_time_ts_ = now();
248 total_time_ =duration::zero() ;
251 time_alone_=duration::zero() ;; // time during which the counter is open minus the time during which a counter of lower level was open
252 time_ts_=duration::zero() ;; // total time tracked during the current time step
253}
254
255/**************************************************************************************************************************
256 *
257 * Declaration of the class Perf_counters using Pimpl idiom
258 *
259 **************************************************************************************************************************/
260/*! @Brief declare all standard counters of TRUST inside an array
261 *
262 */
264{
265public:
 // Same clock and duration types as the ones used by Counter
266 using clock = std::chrono::high_resolution_clock;
267 using time_point = std::chrono::time_point<clock>;
268 using duration = std::chrono::duration<double>;
270 {
271 return clock::now();
272 }
273 Impl();
 // Counters are addressed either by their STD_COUNTERS identifier (standard counters)
 // or by their description string (custom counters), hence the pairs of overloads below.
275 void create_custom_counter_impl(std::string counter_description, int counter_level, std::string counter_family, bool is_comm, bool is_gpu);
276 void begin_count_impl(const STD_COUNTERS& std_cnt, int counter_lvl);
277 void begin_count_impl(const std::string& custom_count_name, int counter_lvl);
278 void end_count_impl(const STD_COUNTERS& std_cnt, int count_increment, long int quantity_increment);
279 void end_count_impl(const std::string& custom_count_name, int count_increment, long int quantity_increment);
280 void stop_counters_impl();
282 void reset_counters_impl();
283 void set_time_steps_elapsed_impl(int time_step_elapsed);
285 double get_total_time_impl(const STD_COUNTERS& name);
286 double get_total_time_impl(const std::string& name);
287 double get_time_since_last_open_impl(const STD_COUNTERS& name);
288 double get_time_since_last_open_impl(const std::string& name);
289 void start_timeloop_impl();
290 void end_timeloop_impl();
292 void end_time_step_impl(long int tstep);
294 void print_TU_files_impl(const std::string& message);
296 void stop_gpu_timer_impl();
297 double compute_gpu_time_impl();
298 bool is_gpu_verbose_on_impl() const ;
299 void set_gpu_verbose_impl(bool on) ;
300 bool get_init_device_impl() const ;
301 void set_init_device_impl(bool init) ;
302 bool get_gpu_timer_impl() const ;
303 void add_to_gpu_timer_counter_impl(int to_add=1) ;
304 int get_gpu_timer_counter_impl() const ;
305 bool get_use_gpu_impl() const {return use_gpu_;}
306 bool get_gpu_fence_impl() const {return gpu_fence_;}
307 void set_gpu_fence_impl(bool fence) {gpu_fence_=fence;}
308 bool running_impl(const STD_COUNTERS name) { return get_counter(name).running_(); }
 // Adds nb_elem to the running total of elements
309 void record_nb_elem_impl(trustIdType nb_elem) {nb_elem_tot_+=nb_elem;}
310
311private:
312 Counter& get_counter(const STD_COUNTERS name) ;
313 Counter& get_counter(const std::string name);
314 void check_begin(Counter& c, int counter_lvl, time_point t);
315 void check_end(Counter& c, time_point t);
316 double compute_allreduce_peak();
317 std::string get_os() const;
318 CPUInfo get_cpu() const;
319 GPUInfo get_gpu() const;
320 std::string get_date() const;
321 void print_performance_to_csv(const std::string& message);
322 void print_global_TU(const std::string& message);
323 std::array<std::unique_ptr<Counter>, static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER)> std_counters_; ///< Standard counters, indexed by their STD_COUNTERS value
324 std::map<std::string, std::unique_ptr<Counter>> custom_counter_map_str_to_counter_; ///< Custom counters, indexed by their description
325 bool end_cache_=false; ///< A flag used to know whether the first time steps (cache) are over or not
326 bool time_loop_=false; ///< A flag used to know if we are inside the time loop
327 bool counters_stop_=false; ///< A flag used to know if the counters are paused or not
328 //int counter_lvl_to_print_=1; ///< Counter level that you want to be printed in the global_TU
329 duration computation_time_=duration::zero(); ///< Used to compute the total time of the simulation.
330 duration time_skipped_ts_=duration::zero(); ///< Duration in seconds of the cache. If the cache is too long, use the function set_three_first_steps_elapsed in order to include the stats of the cache in your stats
331 Counter* last_opened_counter_=nullptr; ///< Pointer to the last opened counter. Each counter has a parent attribute, which points to the counter that was open before it.
332#ifdef TRUST_USE_GPU
333 int nb_steps_elapsed_=1; ///< On GPU, the first time step is not representative: a lot of H->D copies occur yet
334#else
335 int nb_steps_elapsed_=0; ///< By default, we consider that the two first time steps are used to fill the cache, so they are not taken into account in the stats. NOTE(review): the value here is 0 - confirm the comment still matches
336#endif
337 int total_nb_backup_=0;
338 double total_data_exchange_per_backup_=0.;
339 bool gpu_verbose_ =false;
340 bool init_device_ = false;
341 bool gpu_timer_ = false;
342 bool use_gpu_=false;
343 bool gpu_fence_=true;
344 time_point gpu_timer_start_;
345 int gpu_timer_count_=0;
346 int max_str_length_=118; ///< Maximum length of the OS and date strings printed in the log header
347 trustIdType nb_elem_tot_=0;
348};
350
352{
 // Each standard counter is created with: level, description, and optionally family, is_comm and is_gpu
353 // Initialize standard counters
354 std_counters_[static_cast<int>(STD_COUNTERS::total_execution_time)] = std::make_unique<Counter>(-1, "Total time");
355 std_counters_[static_cast<int>(STD_COUNTERS::computation_start_up)] = std::make_unique<Counter>(0, "Computation start-up");
356 std_counters_[static_cast<int>(STD_COUNTERS::timeloop)] = std::make_unique<Counter>(0, "Time loop");
357 std_counters_[static_cast<int>(STD_COUNTERS::backup_file)] = std::make_unique<Counter>(0, "Back-up operations");
358 std_counters_[static_cast<int>(STD_COUNTERS::system_solver)] = std::make_unique<Counter>(1, "Linear solver resolutions Ax=B");
359 std_counters_[static_cast<int>(STD_COUNTERS::petsc_solver)] = std::make_unique<Counter>(2, "Petsc solver");
360 std_counters_[static_cast<int>(STD_COUNTERS::matrix_assembly)] = std::make_unique<Counter>(1, "Matrix assembly for implicit scheme");
361 std_counters_[static_cast<int>(STD_COUNTERS::ajouter_blocs)] = std::make_unique<Counter>(1, "Call to ::ajouter_blocs for matrix assembly");
362 std_counters_[static_cast<int>(STD_COUNTERS::implicit_diffusion)] = std::make_unique<Counter>(1, "Solver for implicit diffusion");
363 std_counters_[static_cast<int>(STD_COUNTERS::compute_dt)] = std::make_unique<Counter>(1, "Computation of the time step dt");
364 std_counters_[static_cast<int>(STD_COUNTERS::turbulent_viscosity)] = std::make_unique<Counter>(1, "Turbulence model::update");
365 std_counters_[static_cast<int>(STD_COUNTERS::convection)] = std::make_unique<Counter>(1, "Convection operator");
366 std_counters_[static_cast<int>(STD_COUNTERS::diffusion)] = std::make_unique<Counter>(1, "Diffusion operator");
367 std_counters_[static_cast<int>(STD_COUNTERS::gradient)] = std::make_unique<Counter>(1, "Gradient operator");
368 std_counters_[static_cast<int>(STD_COUNTERS::divergence)] = std::make_unique<Counter>(1, "Divergence operator");
369 std_counters_[static_cast<int>(STD_COUNTERS::source_terms)] = std::make_unique<Counter>(1, "Source terms");
370 std_counters_[static_cast<int>(STD_COUNTERS::postreatment)] = std::make_unique<Counter>(1, "Post-treatment operations");
371 std_counters_[static_cast<int>(STD_COUNTERS::restart)] = std::make_unique<Counter>(1, "Read file for restart");
372 std_counters_[static_cast<int>(STD_COUNTERS::update_variables)] = std::make_unique<Counter>(1, "Update ::mettre_a_jour");
373 // MPI communication counters (created with is_comm = true)
374 std_counters_[static_cast<int>(STD_COUNTERS::mpi_sendrecv)] = std::make_unique<Counter>(2, "MPI_send_recv", "MPI_sendrecv", true);
375 std_counters_[static_cast<int>(STD_COUNTERS::mpi_send)] = std::make_unique<Counter>(2, "MPI_send", "MPI_sendrecv", true);
376 std_counters_[static_cast<int>(STD_COUNTERS::mpi_recv)] = std::make_unique<Counter>(2, "MPI_recv", "MPI_sendrecv", true);
377 std_counters_[static_cast<int>(STD_COUNTERS::mpi_bcast)] = std::make_unique<Counter>(2, "MPI_broadcast", "MPI_sendrecv", true);
378 std_counters_[static_cast<int>(STD_COUNTERS::mpi_alltoall)] = std::make_unique<Counter>(2, "MPI_alltoall", "MPI_sendrecv", true);
379 std_counters_[static_cast<int>(STD_COUNTERS::mpi_allgather)] = std::make_unique<Counter>(2, "MPI_allgather", "MPI_sendrecv", true);
380 std_counters_[static_cast<int>(STD_COUNTERS::mpi_gather)] = std::make_unique<Counter>(2, "MPI_gather", "MPI_sendrecv", true);
381 std_counters_[static_cast<int>(STD_COUNTERS::mpi_partialsum)] = std::make_unique<Counter>(2, "MPI_partialsum", "MPI_allreduce", true);
382 std_counters_[static_cast<int>(STD_COUNTERS::mpi_sumdouble)] = std::make_unique<Counter>(2, "MPI_sumdouble", "MPI_allreduce", true);
383 std_counters_[static_cast<int>(STD_COUNTERS::mpi_mindouble)] = std::make_unique<Counter>(2, "MPI_mindouble", "MPI_allreduce", true);
384 std_counters_[static_cast<int>(STD_COUNTERS::mpi_maxdouble)] = std::make_unique<Counter>(2, "MPI_maxdouble", "MPI_allreduce", true);
385 std_counters_[static_cast<int>(STD_COUNTERS::mpi_sumfloat)] = std::make_unique<Counter>(2, "MPI_sumfloat", "MPI_allreduce", true);
386 std_counters_[static_cast<int>(STD_COUNTERS::mpi_minfloat)] = std::make_unique<Counter>(2, "MPI_minfloat", "MPI_allreduce", true);
387 std_counters_[static_cast<int>(STD_COUNTERS::mpi_maxfloat)] = std::make_unique<Counter>(2, "MPI_maxfloat", "MPI_allreduce", true);
388 std_counters_[static_cast<int>(STD_COUNTERS::mpi_sumint)] = std::make_unique<Counter>(2, "MPI_sumint", "MPI_allreduce", true);
389 std_counters_[static_cast<int>(STD_COUNTERS::mpi_minint)] = std::make_unique<Counter>(2, "MPI_minint", "MPI_allreduce", true);
390 std_counters_[static_cast<int>(STD_COUNTERS::mpi_maxint)] = std::make_unique<Counter>(2, "MPI_maxint", "MPI_allreduce", true);
391 std_counters_[static_cast<int>(STD_COUNTERS::mpi_barrier)] = std::make_unique<Counter>(2, "MPI_barrier", "MPI_allreduce", true);
392
393 // GPU counters (created with is_gpu = true)
394 std_counters_[static_cast<int>(STD_COUNTERS::gpu_library)] = std::make_unique<Counter>(2, "GPU_library", "GPU_library", false, true);
395 std_counters_[static_cast<int>(STD_COUNTERS::gpu_kernel)] = std::make_unique<Counter>(2, "GPU_kernel", "GPU_kernel", false, true);
396 std_counters_[static_cast<int>(STD_COUNTERS::gpu_copytodevice)] = std::make_unique<Counter>(2, "GPU_copyToDevice", "GPU_copy", false, true);
397 std_counters_[static_cast<int>(STD_COUNTERS::gpu_copyfromdevice)] = std::make_unique<Counter>(2, "GPU_copyFromDevice","GPU_copy",false, true);
398 std_counters_[static_cast<int>(STD_COUNTERS::gpu_malloc_free)] = std::make_unique<Counter>(2, "GPU_allocations" ,"GPU_alloc",false,true);
399 // Scatter (flagged as communication counters)
400 std_counters_[static_cast<int>(STD_COUNTERS::interprete_scatter)] = std::make_unique<Counter>(2, "Scatter_interprete", "None", true,false);
401 std_counters_[static_cast<int>(STD_COUNTERS::virtual_swap)] = std::make_unique<Counter>(2, "DoubleVect/IntVect::virtual_swap", "None", true);
402 std_counters_[static_cast<int>(STD_COUNTERS::read_scatter)] = std::make_unique<Counter>(2, "Scatter::read_domaine", "None", true);
403 // Parallel meshing
404 std_counters_[static_cast<int>(STD_COUNTERS::parallel_meshing)] = std::make_unique<Counter>(0, "Parallel meshing");
405 // IO
406 std_counters_[static_cast<int>(STD_COUNTERS::IO_EcrireFicPartageBin)] = std::make_unique<Counter>(2, "write", "IO");
407 std_counters_[static_cast<int>(STD_COUNTERS::IO_EcrireFicPartageMPIIO)] = std::make_unique<Counter>(2,"MPI_File_write_all", "IO");
 // When no time step has to be skipped, the cache phase is considered over from the start
408 if (nb_steps_elapsed_==0)
409 end_cache_=true;
410#ifdef TRUST_USE_GPU
411 use_gpu_=true;
412#endif
413}
414/////// Private methods of Pimpl
415
416Counter& Perf_counters::Impl::get_counter(const STD_COUNTERS name)
417{
418 return *std_counters_[static_cast<int>(name)];
419}
420
421Counter& Perf_counters::Impl::get_counter(std::string cust_counter_desc)
422{
423 if (custom_counter_map_str_to_counter_.count(cust_counter_desc)==0)
424 Process::exit("You are trying to find a custom counter that does not exists");
425 return *custom_counter_map_str_to_counter_.at(cust_counter_desc);
426}
427
428void Perf_counters::Impl::check_begin(Counter& c, int counter_lvl, time_point t)
429{
430 if (last_opened_counter_ != nullptr)
431 {
432 if (c.is_running_)
433 Process::exit("The counter that you are trying to start is already running:" + c.description_);
434 /*
435 if (last_opened_counter_->is_comm_)
436 Process::exit("The last open_counter is a communication counter, you should close the communication counter first:" + last_opened_counter_->description_);
437 if (last_opened_counter_->is_gpu_)
438 Process::exit("The last open_counter is a gpu counter, you should close the gpu counter first:" + last_opened_counter_->description_);
439 */
440 int expected_lvl = last_opened_counter_->level_ +1;
441 if (c.is_comm_)
442 {
443 counter_lvl=expected_lvl;
444 }
445 if (counter_lvl != expected_lvl)
446 {
447 std::ostringstream error_msg ;
448 error_msg << "The counter you are trying to start does not have the expected level, counter running: " << last_opened_counter_->description_ << " counter that you try to open: " << c.description_ << " ; expected level: "<< expected_lvl << std::endl ;
449 Process::exit(error_msg.str());
450 }
451 if (time_loop_)
452 {
453 c.open_time_ts_ = t;
454 }
455 c.set_parent(last_opened_counter_);
456 }
457 last_opened_counter_ =&c;
458}
459
// Validate the closing of counter c: it must be running and be the last opened counter, otherwise
// the program exits through Process::exit. The parent of c then becomes the last opened counter.
460void Perf_counters::Impl::check_end(Counter& c, time_point t)
461{
462 if (!c.is_running_ || last_opened_counter_==nullptr)
463 Process::exit("You are trying to close a counter that is not running: " + c.description_);
464 if (last_opened_counter_ != &c)
465 {
466 std::string error_msg = "The counter you are trying to close is not the last opened, counter: " + c.description_;
467 Process::exit(error_msg);
468 }
469 if (time_loop_)
470 {
 // Inside the time loop, also accumulate the time spent in the current time step
471 c.time_ts_ += t - c.open_time_ts_;
473 }
474 last_opened_counter_ = c.parent_;
475 if (c.is_comm_)
476 {
 // check_begin() accepts communication counters at any depth; the level is put back to 2 on closing
477 c.level_=2;
478 }
479}
480
// Time 100 consecutive Process::mp_sum calls on an int and return the smallest per-call duration
// (in seconds) found over the processes.
481double Perf_counters::Impl::compute_allreduce_peak()
482{
484 time_point t1 = now();
485 int i = 0;
486 for (i = 0; i < 100; i++)
487 Process::mp_sum(static_cast<int>(1));
488 time_point t2 = now();
489 duration time = t2-t1;
490 double allreduce_peak_perf = time.count();
 // Average over the 100 calls of the fastest process
491 return Process::mp_min(allreduce_peak_perf)/100.0;
492}
493
494
/*! @brief Shorten every run of consecutive space characters to at most two spaces.
 *
 * Only the ' ' character is considered; any other character is copied unchanged.
 *
 * @param str text to process
 * @return copy of str where no run of spaces is longer than two
 */
std::string delete_blank_spaces(std::string str)
{
  std::string squeezed;
  int consecutive_blanks = 0;
  for (const char current : str)
    {
      if (current != ' ')
        {
          consecutive_blanks = 0;
          squeezed += current;
          continue;
        }
      // Keep the first two spaces of a run, drop the following ones
      if (++consecutive_blanks <= 2)
        squeezed += current;
    }
  return squeezed;
}
515
516/*! @brief Return a string that contains os information
517 *
518 * for example:
519 * node name = is247793
520 * system name = Linux
521 * machine = x86_64
522 * release = 6.5.0-44-generic
523 * version = #44~22.04.1-Ubuntu SMP PREEMPT_DYNAMIC + current date and time
524 *
525 * @return string with is, os info and date
526 */
527std::string Perf_counters::Impl::get_os() const
528{
529 std::string result;
530 struct utsname buffer;
531 if (uname(&buffer) == -1)
532 return "Error: Unable to retrieve OS info";
533 result += std::string(buffer.nodename) + "__";
534 result += std::string(buffer.sysname) + "__";
535 result += std::string(buffer.machine) + "__";
536 result += std::string(buffer.release)+ "__";
537 result += std::string(buffer.version);
538 result = delete_blank_spaces(result);
539 return result.substr(0,max_str_length_);
540}
541
542/*!
543 *
544 * @return string that contains cpu model and number of proc
545 */
546
547
548
549CPUInfo Perf_counters::Impl::get_cpu() const
550{
551 CPUInfo info;
552 info.num_threads = std::thread::hardware_concurrency();
553#if defined(__APPLE__)
554 info.model = "Apple";
555
556#elif defined(__CYGWIN__)
557 info.model = "Cygwin";
558
559#elif defined(__linux__)
560 std::ifstream cpuinfo("/proc/cpuinfo");
561 if (cpuinfo.good())
562 {
563 std::string line;
564 while (std::getline(cpuinfo, line))
565 {
566 if (line.find("model name") != std::string::npos)
567 {
568 size_t pos = line.find(':');
569 if (pos != std::string::npos)
570 {
571 info.model = line.substr(pos + 2); // +2 pour sauter ": "
572 break;
573 }
574 }
575 }
576 }
577 if (info.model.empty())
578 {
579 info.model = "Unknown Linux CPU";
580 }
581#else
582 info.model = "Unknown CPU";
583#endif
584 return info;
585}
586
587/*!
588 *
589 * @return string with gpu model name
590 */
591GPUInfo Perf_counters::Impl::get_gpu() const
592{
593 GPUInfo info;
594#ifdef TRUST_USE_GPU
595
596#if !(defined(TRUST_USE_ROCM) || defined(TRUST_USE_CUDA))
597#error "Neither CUDA nor HIP macros defined, but TRUST_USE_GPU is defined! Something's wrong."
598#endif
599
600 gpuDeviceProp_t prop;
601 int device;
602 int driverVersion, runtimeVersion;
603
604 auto err1=gpuGetDevice(&device);
605 if(err1!=GPU_SUCCESS)
606 Cerr<<"Failed to get GPU device model"<<std::endl;
607 auto err2=gpuGetDeviceProperties(&prop, device);
608 if(err2!=GPU_SUCCESS)
609 Cerr<<"Failed to get GPU device properties"<<std::endl;
610 auto err3=gpuDriverGetVersion(&driverVersion);
611 if(err3!=GPU_SUCCESS)
612 Cerr<<"Failed to get GPU driver version"<<std::endl;
613 auto err4=gpuRuntimeGetVersion(&runtimeVersion);
614 if(err4==GPU_SUCCESS)
615 Cerr<<"Failed to get GPU runtime version"<<std::endl;
616
617 info.name = std::string(prop.name);
618
619 std::ostringstream runtime_stream;
620 runtime_stream << (runtimeVersion / VERSION_DIVISOR) << "."
621 << ((runtimeVersion % VERSION_MOD) / (VERSION_MOD / 100));
622 info.runtime_version = runtime_stream.str();
623
624 std::ostringstream driver_stream;
625 driver_stream << (driverVersion / VERSION_DIVISOR) << "."
626 << ((driverVersion % VERSION_MOD) / (VERSION_MOD / 100));
627 info.driver_version = driver_stream.str();
628#endif
629 return info;
630}
631
632/*!
633 *
634 * @return string with complete date and time : DD-MM-YYYY -- hour:minute:second
635 */
636std::string Perf_counters::Impl::get_date() const
637{
638 time_t now = time(0);
639 std::ostringstream date;
640 struct tm tstruct = *localtime(&now);
641 date<< std::put_time(&tstruct, "%d-%m-%Y -- %X");
642 std::string result = date.str();
643 result = delete_blank_spaces(result);
644 return (result.substr(0,max_str_length_));
645}
646
/*! @brief Append one line of the csv performance file to a stream.
 *
 * Each item is right-aligned in a field of the width given for its column. Items are separated
 * by " \t" and the line is ended by a line break, without delimiter after the last item.
 *
 * @param lines stream that receives the line
 * @param line_items the 24 items of the line
 * @param item_size the field width of each of the 24 columns
 */
static void build_line_csv(std::ostringstream& lines, const std::array<std::string,24>& line_items, const std::array<int,24>& item_size)
{
  const std::size_t last_column = line_items.size() - 1;
  for (std::size_t column = 0; column <= last_column; ++column)
    {
      // Same width for every item of a column
      lines << std::setw(item_size[column]) << line_items[column];
      if (column < last_column)
        lines << " \t";      // column delimiter
      else
        lines << std::endl;  // end of the line: line break, no delimiter
    }
}
662
/*! @brief Empty a string stream so that it can be reused.
 *
 * @param lines stream whose error flags are reset and whose content is discarded
 */
void clean_stringstream(std::ostringstream& lines)
{
  lines.clear();             // reset the state flags
  lines.str(std::string());  // drop the buffered text
}
668
669void Perf_counters::Impl::print_performance_to_csv(const std::string& message)
670{
671 assert(!message.empty());
672 std::ostringstream perfs; ///< Stringstream that contains stats for each processor
673 std::ostringstream perfs_globales; ///< Stringstream that contains stats average on the processors : processor number = -1
674 std::ostringstream file_header; ///< Stringstream that contains the lines at the start of the file
675 trustIdType nb_elem_tot = Process::mp_sum(nb_elem_tot_);
676 const int length_line = 24; ///< number of item of a line of the _csv.Tu file
677 std::array<int,length_line> item_size; ///< Contains the the width of the printed string, 20 for numbers by default
678 for (int& j:item_size)
679 j=20;
680 std::array<std::string,length_line> line_items; ///< Contains the data of a line that we want to print in the _csv.TU file.
681 for (std::string& str :line_items)
682 str="";
683 std::ostringstream tmp_item; ///< Create a temporary ostringstream for converting wanted line items in string to construct the line_items vector and therefore
684 int nb_procs = Process::nproc();
685
686 /// We specify the width of large items of lines of the _csv.Tu file for making it readable by human
687 item_size[0] = 50;
688 item_size[2] = 40;
689 item_size[3] = 45;
690
691 if ( (Process::je_suis_maitre()) && (message == "Computation start-up statistics") )
692 {
693 CPUInfo cpu = get_cpu();
694 file_header << "# Detailed performance log file for case: " << Objet_U::nom_du_cas()<<". See the associated validation form for an example of data analysis"<< std::endl;
695 file_header << "# Date of the computation: " << get_date() << std::endl;
696 file_header << "# OS used: " << get_os() << std::endl;
697 file_header << "# CPU model: " << cpu.model << std::endl;
698 file_header << "# Total number of threads:" << cpu.num_threads << std::endl;
699 if (use_gpu_)
700 {
701 GPUInfo gpu = get_gpu();
702 file_header << "# GPU model: " << gpu.name << std::endl;
703#ifdef TRUST_USE_CUDA
704 file_header << "# CUDA runtime version: " << gpu.runtime_version << std::endl;
705 file_header << "# CUDA drivers version: " << gpu.driver_version << std::endl;
706#endif
707#ifdef TRUST_USE_ROCM
708 file_header << "# HIP runtime version: " << gpu.runtime_version << std::endl;
709 file_header << "# HIP drivers version: " << gpu.driver_version << std::endl;
710#endif
711 }
712 else
713 file_header << "# GPU model: "<< "No GPU used for the computation" << std::endl;
714 file_header << "# Number of processor used = " << nb_procs << std::endl;
715 file_header << "# Total number of elements used for the calculation: " << nb_elem_tot << std::endl;
716 file_header << "# The time was measured by the following method using std::chrono::high_resolution_clock::now() and is printed in seconds" << std::endl ;
717 file_header << "# By default, only averaged statistics on all processor are printed. For accessing the detail per processor, add 'stat_per_proc_perf_log 1' in the data file"<< std::endl;
718 file_header << "# Processor number equal to -1 corresponds to the performance of the calculation averaged on the processors during the simulation step" << std::endl;
719 file_header << "# If a counter does not belong in any particular family, then counter family is set to None" << std::endl;
720 file_header << "# Count means the number of time the counter is called during the overall calculation step." << std::endl;
721 file_header << "# Min, max and SD accounts respectively for the minimum, maximum and Standard Deviation of the quantity of the previous row." << std::endl;
722 file_header << "# Quantity is a custom variable that depends on the counter. It is used to compute bandwidth for communication counters for example. See the table at the end of the introduction on statistics in TRUST form for more details." << std::endl;
723 file_header << "# To retrieve the time not tracked by any counter of level 1 or higher, sum the 'time alone' value of counters of level -1 and 0." << std::endl;
724 file_header << "#" << std::endl << "#" << std::endl;
725 /// Then we create a vector line_items that contains each item we want to print
726 line_items[0] = "Overall_simulation_step";
727 line_items[1] = "Processor_Number";
728 line_items[2] = "Counter_family";
729 line_items[3] = "Counter_name";
730 line_items[4] = "Counter_level";
731 line_items[5] = "Is_comm";
732 line_items[6] = "%_total_time";
733 line_items[7] = "total time";
734 line_items[8] = "t_min";
735 line_items[9] = "t_max";
736 line_items[10] = "t_SD";
737 line_items[11] = "time alone";
738 line_items[12] = "t_alone_min";
739 line_items[13] = "t_alone_max";
740 line_items[14] = "t_alone_SD";
741 line_items[15] = "count";
742 line_items[16] = "time_per_step";
743 line_items[17] = "tps_min";
744 line_items[18] = "tps_max";
745 line_items[19] = "tps_SD";
746 line_items[20] = "Quantity";
747 line_items[21] = "q_min";
748 line_items[22] = "q_max";
749 line_items[23] = "q_SD";
750
751 assert(item_size.size()==length_line);
752 assert(line_items.size()==item_size.size());
753 /// After filling line_items and item_size, we use the function build_line_csv to build the line at the expected format
754 build_line_csv(file_header,line_items,item_size);
755 }
756
757 /// Check if all of the processors see the same number of counter, if not print an error message in perfs_globales
758 bool skip_globals = false;
759 int total_nb_of_counters = static_cast<int>(std_counters_.size()) + static_cast<int>(custom_counter_map_str_to_counter_.size());
760 int min_total_nb_of_counters = total_nb_of_counters;
761 int max_total_nb_of_counters = total_nb_of_counters;
762
763 if ( (max_total_nb_of_counters - min_total_nb_of_counters)!=0 )
764 {
766 {
767 perfs_globales << "Unable to collect statistics :" << std::endl
768 << " there is not the same number of counters on all"
769 " processors."<< std::endl;
770 }
771 skip_globals = true; ///< If min_nb_of_counters != max_nb_of_counters, aggregated stats are not printed
772 }
773 Counter& c_time = get_counter(STD_COUNTERS::timeloop);
774 int nb_ts = c_time.count_- nb_steps_elapsed_;
775
776 if (time_loop_ && nb_ts <= 0)
777 {
779 {
780 perfs_globales << "The computation is shorter than cache" << std::endl;
781 }
782 skip_globals = true; ///< If min_nb_of_counters != max_nb_of_counters, aggregated stats are not printed
783 }
784
785 int level; ///< Level of details of the counter
786 bool is_comm; ///< Equal to 1 if the counter is a communication counter, 0 otherwise
787 int count; ///< number of time the counter is open and closed
788 long int quantity, min_quantity=0, max_quantity=0; ///< A custom quantity which depends on the counter. Used for example to compute the bandwidth
789 double time,time_alone,min_time_alone=0.,max_time_alone=0.,SD_time_alone=0.0;
790 double percent_time=0., min_time=0.0, max_time=0.0; ///< Percent of the total time used in the method tracked by the counter
791 double SD_time=0.0, SD_quantity=0.0; ///< the standard dev of all the prev vars
792 double avg_time_per_step=0., min_time_per_step=0., max_time_per_step=0., sd_time_per_step=0.;
793
794 auto fill_items = [&](int proc_number, const std::string desc, const std::string familly)
795 {
796 tmp_item << message; ///< Convert into string the item we want to print in the line, here the overall simulation step
797 line_items[0] = tmp_item.str(); ///< Add the item to the vector line_itmes, used to construct the line of the _csv.TU file
798
799 tmp_item.str(""); ///< Empties the temporary ostringstream
800
801 tmp_item<< proc_number;
802 line_items[1] = tmp_item.str(); ///< Add the processor number to the vector line_items
803 tmp_item.str("");
804
805 tmp_item<< familly;
806 line_items[2] = tmp_item.str(); ///< Add the counter's family to the vector line_items, null if the counter does not belong in a family
807 tmp_item.str("");
808
809 tmp_item << desc;
810 line_items[3] = tmp_item.str(); ///< Add the counter's name to the vector line_items
811 tmp_item.str("");
812
813 tmp_item<< level;
814 line_items[4] = tmp_item.str(); ///< Add the counter's level to the vector line_items
815 tmp_item.str("");
816
817 tmp_item<< is_comm;
818 line_items[5] = tmp_item.str(); ///< Add 1 if the counter is a communication counter, 0 otherwise
819 tmp_item.str("");
820
821 tmp_item<< std::setprecision(4);
822 tmp_item<< percent_time;
823 line_items[6] = tmp_item.str(); ///< Add the percent of total time used by the operation tracked by counter i to the vector line_items
824 tmp_item.str("");
825
826 tmp_item << std::scientific << std::setprecision(7);
827 tmp_item<< time;
828 line_items[7] = tmp_item.str(); ///< Time elapsed when using the operation tracked by counter i
829 tmp_item.str("");
830
831 tmp_item<< min_time;
832 line_items[8] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
833 tmp_item.str("");
834
835 tmp_item<< max_time;
836 line_items[9] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
837 tmp_item.str("");
838
839 tmp_item<< SD_time;
840 line_items[10] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
841 tmp_item.str("");
842
843 tmp_item<< time_alone;
844 line_items[11] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
845 tmp_item.str("");
846
847 tmp_item<< min_time_alone;
848 line_items[12] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
849 tmp_item.str("");
850
851 tmp_item<< max_time_alone;
852 line_items[13] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
853 tmp_item.str("");
854
855 tmp_item<< SD_time_alone;
856 line_items[14] = tmp_item.str(); ///< Detail per proc, so the min, max, avg and SD on proc is equal to 0
857 tmp_item.str("");
858
859 tmp_item<< count;
860 line_items[15] = tmp_item.str(); ///< Number of time the counter was called on the overall simulation step
861 tmp_item.str("");
862
863 tmp_item<< avg_time_per_step;
864 line_items[16] = tmp_item.str(); ///< Averaged time elapsed by time step for the operation tracked by the counter on the overall simulation step
865 tmp_item.str("");
866
867 tmp_item<< min_time_per_step;
868 line_items[17] = tmp_item.str(); ///< Minimum time elapsed by time step for the operation tracked by the counter on the overall simulation step
869 tmp_item.str("");
870
871 tmp_item<< max_time_per_step;
872 line_items[18] = tmp_item.str(); ///< Maximum time elapsed by time step for the operation tracked by the counter on the overall simulation step
873 tmp_item.str("");
874
875 tmp_item<< sqrt(std::max(0., sd_time_per_step));
876 line_items[19] = tmp_item.str(); ///< Standard Deviation of time elapsed by time step for the operation tracked by the counter on the overall simulation step
877 tmp_item.str("");
878
879 tmp_item<< quantity;
880 line_items[20] = tmp_item.str(); ///< Custom variable that depends on the counter the overall simulation step
881 tmp_item.str("");
882
883 tmp_item<< min_quantity;
884 line_items[21] = tmp_item.str(); ///< Detail per proc, so the min, max and SD on proc is equal to 0
885 tmp_item.str("");
886
887 tmp_item<< max_quantity;
888 line_items[22] = tmp_item.str(); ///< Detail per proc, so the min, max and SD on proc is equal to 0
889 tmp_item.str("");
890
891 tmp_item<< SD_quantity;
892 line_items[23] = tmp_item.str(); ///< Detail per proc, so the min, max and SD on proc is equal to 0
893 tmp_item.str("");
894 };
895
896 auto extract_stats = [&](const Counter & c_lambda)
897 {
898 level = c_lambda.level_; ///< Level of details of the counter
899 is_comm = c_lambda.is_comm_; ///< Equal to 1 if the counter is a communication counter, 0 otherwise
900 time = c_lambda.total_time_.count();
901 time_alone = c_lambda.time_alone_.count();
902 count = c_lambda.count_;
903 quantity = c_lambda.quantity_;
904 avg_time_per_step = c_lambda.avg_time_per_step_;
905 min_time_per_step = c_lambda.min_time_per_step_;
906 max_time_per_step = c_lambda.max_time_per_step_;
907 sd_time_per_step = std::sqrt(std::max(0., c_lambda.var_time_per_step_));
908 min_time = 0.;
909 max_time = 0.;
910 SD_time = 0.;
911 min_quantity = 0.;
912 max_quantity = 0.;
913 SD_quantity = 0.;
914 min_time_alone = 0.;
915 max_time_alone = 0.;
916 SD_time_alone = 0.;
918 {
919 fill_items(Process::me(),c_lambda.description_, c_lambda.family_);
920 build_line_csv(perfs,line_items,item_size); ///< Build the line of the stats associated on the counter i for a single proc
921 }
923 {
924 std::array< std::array<double,4> ,4> table = c_lambda.compute_min_max_avg_sd_();
926 {
927 time = table[0][2];
928 min_time = table[0][0];
929 max_time = table[0][1];
930 SD_time = table[0][3];
931 quantity = static_cast <long int>(std::floor(table[1][2]));
932 min_quantity = static_cast <long int>(std::floor(table[1][0]));
933 max_quantity = static_cast <long int>(std::floor(table[1][1]));
934 SD_quantity = table[1][3];
935 time_alone = table[3][2];
936 min_time_alone = table[3][0];
937 max_time_alone = table[3][1];
938 SD_time_alone = table[3][3];
939 if (! skip_globals )
940 {
941 fill_items(-1,c_lambda.description_,c_lambda.family_);
942 build_line_csv(perfs_globales,line_items,item_size);
943 }
944 }
945 }
946 };
947
948 for (int i =0 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
949 {
950 Counter& c_std = *std_counters_[i];
951 extract_stats(c_std);
952 }
953
954 for (const auto & pair : custom_counter_map_str_to_counter_)
955 {
956 if (pair.second!=nullptr)
957 extract_stats(*pair.second);
958 }
959
960 Nom CSV(Objet_U::nom_du_cas());
961 CSV+="_csv.TU";
962 auto printing_mode = ios::app;
963 if (message=="Computation start-up statistics")
964 printing_mode= ios::out;
966 {
967 SFichier file(CSV, printing_mode);
968 file << file_header.str();
969 file << perfs_globales.str();
971 file <<perfs.str();
972 }
975 {
976 EcrFicPartage file(CSV,ios::app);
977 file << perfs.str();
978 file.syncfile();
979 }
980 clean_stringstream(file_header);
981 clean_stringstream(perfs);
982 clean_stringstream(perfs_globales);
983}
984
985/*!@brief Function used for computing communication statistics in the global.TU
986 *
987 * @param time
988 * @param quantity
989 * @param count
990 *
991 * The three parameters are updated by their mean value over the processors
992 * @return an array of array that contains the min, max, average and standard deviation over the processors of the three parameters of the function
993 */
994inline std::array< std::array<double,4> ,3> compute_min_max_avg_sd(double& time, long int& quantity, int& count)
995{
996 double qty,cnt,min,max,avg,sd ;
997 qty=static_cast<double>(quantity);
998 cnt = static_cast<double>(count);
1000 {
1001 auto l_compute =[&min, &max,&avg,&sd] (double value)
1002 {
1003 min = Process::mp_min(value);
1004 max = Process::mp_max(value);
1005 avg = Process::mp_sum(value)/Process::nproc();
1006 sd = sqrt(std::max(0., Process::mp_sum((value-avg)*(value-avg))/Process::nproc()));
1007 std::array<double,4> result = {min,max,avg,sd};
1008 return result;
1009 };
1010 std::array<double,4> min_max_avg_sd_time = l_compute(time);
1011 time =avg;
1012 std::array<double,4> min_max_avg_sd_quantity = l_compute(qty);
1013 std::array<double,4> min_max_avg_sd_count = l_compute(cnt);
1014 count = static_cast<int>(std::floor (avg));
1015 return {min_max_avg_sd_time,min_max_avg_sd_quantity,min_max_avg_sd_count};
1016 }
1017 else
1018 {
1019 std::array<double,4> min_max_avg_sd_time = {time,0.,0.,0.};
1020 std::array<double,4> min_max_avg_sd_quantity = {qty,0.,0.,0.};
1021 std::array<double,4> min_max_avg_sd_count = {cnt,0.,0.,0.};
1022 return {min_max_avg_sd_time,min_max_avg_sd_quantity,min_max_avg_sd_count};
1023 }
1024}
1025
/*! @brief printf-style formatting that returns the result as a std::string
 *
 * @param format printf format string
 * @param args values consumed by the conversion specifiers of format
 *
 * @return the formatted text, or an empty string if snprintf reports an error
 *
 * Results shorter than 128 characters are produced in a stack buffer. Longer results are
 * formatted a second time in a heap buffer of the exact size instead of being truncated.
 */
template <typename... Args>
std::string fmt(const char* format, Args... args)
{
  char buf[128];
  // snprintf returns the length the full result would have, even when it did not fit in buf
  const int written = std::snprintf(buf, sizeof(buf), format, args...);
  if (written < 0)
    return std::string(); // formatting failed: the content of buf is not usable
  const std::size_t length = static_cast<std::size_t>(written);
  if (length < sizeof(buf))
    return std::string(buf);
  // The result was truncated: redo the formatting with room for every character plus the terminator
  std::vector<char> big(length + 1);
  std::snprintf(big.data(), big.size(), format, args...);
  return std::string(big.data());
}
1033
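/*! @brief Builds the aggregated performance statistics of one computation step, printed in the global .TU
 *
 * @param message name of the computation step; must not be empty and is expected to be one of
 *                "Computation start-up statistics", "Time loop statistics" or "Post-resolution statistics"
 */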
1039void Perf_counters::Impl::print_global_TU(const std::string& message)
1040{
1041 assert(!message.empty());
1042 std::ostringstream perfs_TU; ///< Stringstream that contains algomerated stats that will be printed in the .TU
1043 std::ostringstream perfs_GPU; ///< Stringstream that contains algomerated stats that will be printed in the .TU
1044 std::ostringstream perfs_IO; ///< Stringstream that contains algomerated stats that will be printed in the .TU
1045 std::ostringstream captions;
1046 std::ostringstream file_header; ///< Stringstream that contains the File header
1047 const int counter_description_width = 40;
1048 const int time_per_step_width= 15;
1049 const int percent_loop_time_width=11;
1050 const int count_per_ts_width=15;
1051 const int level_width=5;
1052 const int bandwith_width= 10;
1053 const int tabular_custom_line_width= counter_description_width+3+time_per_step_width+3+percent_loop_time_width+3+count_per_ts_width+3+level_width;
1054 const int cpu_line_width=counter_description_width+3+time_per_step_width+3+percent_loop_time_width+3+count_per_ts_width;
1055 const int gpu_line_width=counter_description_width+3+time_per_step_width+3+percent_loop_time_width+3+count_per_ts_width+3+bandwith_width+4;
1056 const int number_width=15;
1057 const int text_width =cpu_line_width-count_per_ts_width;
1058 const int header_txt_width = 10;
1059 const int message_width = static_cast<int>(message.length());
1060 const std::string separator = " | ";
1061 const std::string line_sep_cpu(max_str_length_,'~');
1062 const std::string line_sep_tabular(cpu_line_width,'-');
1063 const std::string line_sep_tabular_custom(tabular_custom_line_width,'-');
1064 const std::string line_sep_gpu(gpu_line_width,'-');
1065 int nb_procs = Process::nproc();
1066 double allreduce_peak_perf = compute_allreduce_peak();
1067 double comm_allreduce_t = 0.0, comm_sendrecv_t = 0.0;
1068 long int comm_allreduce_q = 0.0,comm_sendrecv_q = 0.0;
1069 int comm_allreduce_c = 0,comm_sendrecv_c = 0;
1070 std::array< std::array<double,4> ,3> min_max_avg_sd_t_q_c_sendrecv_comm ;
1071 for (std::array<double,4>& arr: min_max_avg_sd_t_q_c_sendrecv_comm)
1072 for (double & d : arr)
1073 d=-1;
1074 std::array< std::array<double,4> ,3> min_max_avg_sd_t_q_c_allreduce_comm = min_max_avg_sd_t_q_c_sendrecv_comm ;
1075 Counter& c_timeloop = get_counter(STD_COUNTERS::timeloop);
1076 int nb_ts = c_timeloop.count_;
1077 nb_ts = std::max(nb_ts,1);
1078 double time_tl=c_timeloop.total_time_.count();
1079 Counter& c_total_time = get_counter(STD_COUNTERS::total_execution_time);
1080 Counter& c_mpi_sendrecv = get_counter(STD_COUNTERS::mpi_sendrecv);
1081 Counter& c_virtual_swap = get_counter(STD_COUNTERS::virtual_swap);
1082 Counter& c_system_solver= get_counter(STD_COUNTERS::system_solver);
1083 Counter& c_backup = get_counter(STD_COUNTERS::backup_file);
1084 Counter& c_todevice = get_counter(STD_COUNTERS::gpu_copytodevice);
1085 Counter& c_gpu_l = get_counter(STD_COUNTERS::gpu_library);
1086 Counter& c_gpu_k = get_counter(STD_COUNTERS::gpu_kernel);
1087 Counter& c_fromdevice = get_counter(STD_COUNTERS::gpu_copyfromdevice);
1088 Counter& c_io_seq = get_counter(STD_COUNTERS::IO_EcrireFicPartageBin);
1089 Counter& c_io_par = get_counter(STD_COUNTERS::IO_EcrireFicPartageMPIIO);
1090 Counter& c_petsc=get_counter(STD_COUNTERS::petsc_solver);
1091 Counter& c_allocfree=get_counter(STD_COUNTERS::gpu_malloc_free);
1092 int petcs_count = Process::mp_max(c_petsc.count_);
1093 int copy_to_device_count = Process::mp_max(c_todevice.count_);
1094 int max_virtual_swap_c = Process::mp_max(c_virtual_swap.count_);
1095 double avg_solv_time = Process::mp_max(c_system_solver.total_time_.count());
1096 double total_time = c_total_time.total_time_.count();
1097 double total_quantity = Process::mp_sum(static_cast<double>(c_backup.quantity_));
1098 trustIdType nb_elem_tot = Process::mp_sum(nb_elem_tot_);
1099 int max_nb_backup = Process::mp_max(c_backup.count_);
1100 double total_comm_time=0.;
1101 int solver_calls= Process::mp_max(c_system_solver.count_);
1102 nb_ts = Process::mp_max(nb_ts);
1103 double cpu_time=0.;
1104 const double nb_it_per_solver_calls= solver_calls>0 ? static_cast<double>(Process::mp_max(static_cast<double>(c_system_solver.quantity_))) /solver_calls :
1105 static_cast<double>(Process::mp_max(static_cast<double>(c_system_solver.quantity_))) ;
1106 if (max_nb_backup>0)
1107 {
1108 total_nb_backup_ += c_backup.count_;
1109 total_data_exchange_per_backup_ += total_quantity / (max_nb_backup *1024*1024);
1110 }
1111
1112 auto write_globalTU_line = [&] (const Counter& c_to_print_,std::ostringstream & line)
1113 {
1114 if (c_to_print_.count_>0 ) //&& c_to_print_.level_==counter_lvl_to_print_)
1115 {
1116 double t_c = c_to_print_.total_time_.count();
1117 int count = c_to_print_.count_;
1118 line << std::left <<std::setw(counter_description_width) << c_to_print_.description_ <<separator ;
1119 double t = nb_ts>0 ? c_to_print_.avg_time_per_step_ : t_c;
1120 line << std::left << std::setw(time_per_step_width) <<t << separator << std::setprecision(3) << std::setw(percent_loop_time_width) << fmt("%4.1f", t_c/time_tl*100);
1121 if (nb_ts>0)
1122 {
1123 double n = static_cast<double>(count)/nb_ts;
1124 line << separator <<std::left << std::setw(count_per_ts_width) << std::round(n) << std::setprecision(7);
1125 }
1126 line << std::endl;
1127 }
1128 };
1129
1130 auto write_globalTU_line_custom_counters = [&] (const Counter& c_to_print_,std::ostringstream & line)
1131 {
1132 if (c_to_print_.count_>0)
1133 {
1134 double t_c = c_to_print_.total_time_.count();
1135 int count = c_to_print_.count_;
1136 line << std::left <<std::setw(counter_description_width) << "Custom_counter::"+c_to_print_.description_ <<separator ;
1137 double t = nb_ts>0 ? t_c/nb_ts : t_c;
1138 line << std::left << std::setw(time_per_step_width) <<t << separator << std::setprecision(3) << std::setw(percent_loop_time_width) << t_c/time_tl*100 ;
1139 if (nb_ts>0)
1140 {
1141 double n = static_cast<double>(count)/nb_ts;
1142 line << separator <<std::left << std::setw(count_per_ts_width) << std::round(n) << std::setprecision(7);
1143 }
1144 line << separator <<std::setw(level_width) << c_to_print_.level_ << std::endl;
1145 }
1146 };
1147 for (int i =static_cast<int>(STD_COUNTERS::backup_file); i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1148 {
1149 Counter& c_com = *std_counters_[i];
1150 {
1151 if (c_com.count_>0)
1152 {
1153 if (c_com.is_comm_)
1154 {
1155 if (c_com.family_=="MPI_allreduce")
1156 {
1157 comm_allreduce_q += c_com.quantity_;
1158 comm_allreduce_t += c_com.total_time_.count();
1159 comm_allreduce_c += c_com.count_;
1160 }
1161 if (c_com.family_=="MPI_sendrecv")
1162 {
1163 comm_sendrecv_q += c_com.quantity_;
1164 comm_sendrecv_t += c_com.total_time_.count();
1165 comm_sendrecv_c += c_com.count_;
1166 }
1167 total_comm_time += c_com.time_alone_.count();
1168 }
1169 else if (!c_com.is_gpu_)
1170 cpu_time+=c_com.time_alone_.count();
1171 }
1172 }
1173 }
1174 for (const auto & pair : custom_counter_map_str_to_counter_)
1175 {
1176 Counter& c_com = *pair.second;
1177 if (c_com.is_comm_)
1178 {
1179 if (c_com.family_=="MPI_allreduce")
1180 {
1181 comm_allreduce_q += c_com.quantity_;
1182 comm_allreduce_t += c_com.total_time_.count();
1183 comm_allreduce_c += c_com.count_;
1184 }
1185 if (pair.second->family_=="MPI_sendrecv")
1186 {
1187 comm_sendrecv_q += c_com.quantity_;
1188 comm_sendrecv_t += c_com.total_time_.count();
1189 comm_sendrecv_c += c_com.count_;
1190 }
1191 total_comm_time += c_com.time_alone_.count();
1192 }
1193 else if (!c_com.is_gpu_)
1194 cpu_time+=c_com.time_alone_.count();
1195 }
1196 // IO part
1197 // Estimates bandwidth
1198 double bandwidth = 1.1e30;
1199 if (c_mpi_sendrecv.total_time_.count()>0)
1200 bandwidth = static_cast<double>(c_mpi_sendrecv.quantity_)/ (c_mpi_sendrecv.total_time_.count() + MINFLOAT);
1201
1202 double max_bandwidth = Process::mp_max(bandwidth);
1203 // Compute wait time due to synch
1204 // We take the total communication time and we substract the theoretical tume computed with allreduce_peak_perf and max bandwidth
1205 double theoric_comm_time = 0.0;
1206 if(max_bandwidth)
1207 theoric_comm_time = static_cast<double>(comm_allreduce_c) * allreduce_peak_perf + static_cast<double>(comm_sendrecv_c) / max_bandwidth;
1208 // Je suppose que le temps minimum pour realiser les communications sur un proc
1209 // depend du processeur qui a le plus de donnees a envoyer:
1210 theoric_comm_time = Process::mp_max(theoric_comm_time);
1211 double total_time_avg=0.0, total_time_max=0.0;
1212 if(nb_ts >0)
1213 {
1214 total_time_avg = Process::mp_sum(c_timeloop.total_time_.count())/nb_procs;
1215 total_time_max = Process::mp_max(c_timeloop.total_time_.count());
1216 }
1217 else
1218 {
1219 total_time_avg = Process::mp_sum(c_total_time.total_time_.count())/nb_procs;
1220 total_time_max = Process::mp_max(c_total_time.total_time_.count());
1221 }
1222 double wait_time = (comm_sendrecv_t+ comm_allreduce_t)- theoric_comm_time;
1223 double wait_fraction;
1224 if (total_time_avg == 0)
1225 wait_fraction = 0.;
1226 else
1227 wait_fraction = wait_time / (total_time_avg + MINFLOAT);
1228 wait_fraction = 0.1 * floor(wait_fraction * 1000);
1229 if (wait_fraction < 0.)
1230 wait_fraction = 0.;
1231 if (wait_fraction > 100.)
1232 wait_fraction = 100.;
1233
1234 double max_wait_fraction = Process::mp_max(wait_fraction);
1235 double min_wait_fraction = Process::mp_min(wait_fraction);
1236 double avg_wait_fraction = Process::mp_sum(wait_fraction)/ nb_procs;
1237 int debit_seq = 0;
1238 int debit_par = 0;
1239 double com_time_seq = Process::mp_max(c_io_seq.total_time_.count());
1240 double com_time_par = Process::mp_max(c_io_par.total_time_.count());
1241 if (com_time_seq>0)
1242 debit_seq = static_cast<int>(std::floor(static_cast<double>(Process::mp_sum(static_cast<double>(c_io_seq.quantity_))) / (1024 * 1024) /com_time_seq));
1243 if (com_time_par>0)
1244 debit_par = static_cast<int>(std::floor(static_cast<double>(Process::mp_sum(static_cast<double>(c_io_par.quantity_))) / (1024 * 1024) /com_time_par));
1246 {
1247 min_max_avg_sd_t_q_c_allreduce_comm = compute_min_max_avg_sd(comm_allreduce_t,comm_allreduce_q,comm_allreduce_c);
1248 min_max_avg_sd_t_q_c_sendrecv_comm = compute_min_max_avg_sd(comm_sendrecv_t,comm_sendrecv_q,comm_sendrecv_c);
1249 }
1251 {
1252 std::string spaces;
1253 if (message == "Computation start-up statistics")
1254 {
1255 CPUInfo cpu = get_cpu();
1256 spaces.assign((max_str_length_-27)/2,' ');
1257 file_header << spaces <<"# Global performance file #"<< std::endl;
1258 file_header << std::endl;
1259 file_header << "This is the global file for tracking performance in TRUST. It stores aggregated quantities." <<std::endl;
1260 file_header << "More detailed statistics can be found in the "<< Objet_U::nom_du_cas() <<"_csv.TU file" <<std::endl;
1261 //file_header << "A jupyter notebook giving detailed information about performance measurement can be found in:" << std::endl << " $TRUST_ROOT/Validation/Rapports_automatiques/Verification/HowTo/" <<std::endl;
1262 file_header << "For time loop, only standard counters of level 1 are printed alongside your custom counters" << std::endl;
1263 file_header <<"Time is given in seconds"<< std::endl <<std::endl;
1264 file_header << line_sep_cpu << std::endl;
1265 spaces.assign((max_str_length_-26)/2,' ');
1266 file_header << spaces <<"Context of the computation"<< std::endl;
1267 file_header << line_sep_cpu << std::endl;
1268 file_header << std::left << std::setw(header_txt_width)<< "Date:" << get_date() << std::endl;
1269 file_header << std::left << std::setw(header_txt_width)<< "OS:" << get_os() << std::endl;
1270 file_header << std::left << std::setw(header_txt_width) << "CPU model : " << cpu.model << std::endl;
1271 file_header << std::left << std::setw(header_txt_width) << "Total number of threads:" << cpu.num_threads << std::endl;
1272 if (use_gpu_)
1273 {
1274 GPUInfo gpu = get_gpu();
1275 file_header << "GPU model: " << gpu.name << std::endl;
1276#ifdef TRUST_USE_CUDA
1277 file_header << "CUDA runtime version: " << gpu.runtime_version << std::endl;
1278 file_header << "CUDA drivers version: " << gpu.driver_version << std::endl;
1279#endif
1280#ifdef TRUST_USE_ROCM
1281 file_header << "HIP runtime version: " << gpu.runtime_version << std::endl;
1282 file_header << "HIP drivers version: " << gpu.driver_version << std::endl;
1283#endif
1284 }
1285 else
1286 file_header << "GPU model: "<< "No GPU used for the computation" << std::endl;
1287 file_header << std::left << std::setw(header_txt_width) << "Nb procs used for the computation: " << nb_procs << std::endl;
1288 file_header << std::left << std::setw(header_txt_width) << "TRUST version: " << TRUST_VERSION << std::endl ;
1289 file_header << std::left << std::setw(header_txt_width) << "Total number of elements used for the calculation: " << nb_elem_tot << std::endl << std::endl;
1290 file_header << line_sep_cpu << std::endl;
1291 spaces.assign((max_str_length_-message_width)/2,' ');
1292 file_header << spaces<<message << std::endl;
1293 file_header << line_sep_cpu << std::endl;
1294 file_header << std::left << std::setw(text_width)<<"Total time of the start-up: " << std::left <<std::setw(number_width) << c_total_time.total_time_.count() << std::endl;
1295 }
1296 else if (message == "Time loop statistics")
1297 {
1298 file_header << line_sep_cpu << std::endl;
1299 spaces.assign((max_str_length_-message_width)/2,' ');
1300 file_header << spaces<<message << std::endl;
1301 file_header << line_sep_cpu << std::endl;
1302 if (nb_ts <= 0)
1303 {
1304 Cerr << "No time step after cache filling was computed" << finl;
1305 return;
1306 }
1307 if (nb_steps_elapsed_>0)
1308 {
1309 if (nb_steps_elapsed_>1)
1310 file_header << "The " << nb_steps_elapsed_<< " first time steps are not accounted for the computation of the time loop statistics"<< std::endl;
1311 else
1312 file_header << "The first time step is not accounted for the computation of the time loop statistics"<< std::endl;
1313 }
1314 file_header << std::left <<std::setw(text_width)<< "Total time of the time loop: "<< std::left <<std::setw(number_width) << time_tl << std::endl;
1315 file_header << std::left <<std::setw(text_width) << "Number of time steps: " << std::left <<std::setw(number_width) << nb_ts << std::endl;
1316 file_header << std::left <<std::setw(text_width) << "Skipped time steps: " << std::left <<std::setw(number_width) << nb_steps_elapsed_ << std::endl;
1317 file_header << std::left <<std::setw(text_width) << "Average time per time step: " << std::left <<std::setw(number_width) << time_tl/nb_ts << endl;
1318 file_header << std::left <<std::setw(text_width) << "Standard deviation between time steps: " << std::left <<std::setw(number_width) << std::sqrt(std::max(0., c_timeloop.var_time_per_step_)) << std::endl;
1319 file_header << std::left <<std::setw(text_width) << "Time elapsed in the skipped time steps: " << std::left <<std::setw(number_width) << time_skipped_ts_.count() <<std::endl << std::endl;
1321 file_header << std::left <<std::setw(text_width) << "Percent of total time spend in communication: " << std::left <<std::setw(number_width) << 100* total_comm_time / total_time << std::endl;
1322 }
1323 else if (message == "Post-resolution statistics")
1324 {
1325 file_header << line_sep_cpu << std::endl;
1326 spaces.assign((max_str_length_-message_width)/2,' ');
1327 file_header << spaces<<message << std::endl;
1328 file_header << line_sep_cpu << std::endl;
1329 file_header << std::left <<std::setw(text_width) << "Time of the post-resolution: " << std::left <<std::setw(number_width) << c_total_time.total_time_.count() << std::endl;
1330 captions << std::endl;
1332 {
1333 captions << line_sep_cpu << std::endl;
1334 captions << "Max waiting time big => probably due to a bad partitioning" << std::endl;
1335 captions << "Communications > 30% => too many processors or network too slow" << std::endl;
1336 captions << line_sep_cpu << std::endl;
1337 captions << std::endl;
1338 }
1339 captions << std::left << std::setw(text_width) << "Total time for the whole computation" << std::left <<std::setw(number_width) << computation_time_.count()<< std::endl<< std::endl;
1340 }
1341 else
1342 Process::exit("You are trying to get stats of an unknown computation step");
1343
1344 if(message == "Time loop statistics" && c_total_time.total_time_.count()>1.0e-12 && c_timeloop.total_time_.count()>1.0e-12)
1345 {
1346 total_time = c_total_time.total_time_.count();
1347 double other = time_tl/nb_ts;
1348 perfs_TU<<std::endl;
1349 perfs_TU << std::left <<std::setw(counter_description_width) << "Standard counter description" << separator << std::setw(time_per_step_width) << "Time/step" << separator << std::setw(percent_loop_time_width) << "% loop time" << separator << std::setw(count_per_ts_width) << "Call(s)/step"<<std::endl;
1350 perfs_TU << line_sep_tabular << std::endl;
1351 for (int i = static_cast<int>(STD_COUNTERS::system_solver); i< static_cast<int>(STD_COUNTERS::petsc_solver); i++)
1352 {
1353 Counter& c_to_print = *std_counters_[i];
1354 other -= c_to_print.avg_time_per_step_;
1355 write_globalTU_line(c_to_print,perfs_TU);
1356 }
1357 // Loop on the custom counters
1358 if (!custom_counter_map_str_to_counter_.empty())
1359 {
1360 perfs_TU << std::endl;
1361 perfs_TU << std::left <<std::setw(counter_description_width) << "Custom counter description" << separator << std::setw(time_per_step_width) << "Time/step" << separator << std::setw(percent_loop_time_width) << "% loop time" << separator << std::setw(count_per_ts_width) << "Call(s)/step"<< separator <<std::setw(level_width) << "Level" <<std::endl;
1362 perfs_TU << line_sep_tabular_custom<<std::endl;
1363 for (const auto & pair : custom_counter_map_str_to_counter_)
1364 {
1365 Counter& c_to_print = *pair.second;
1366 write_globalTU_line_custom_counters(c_to_print, perfs_TU);
1367 }
1368 }
1369 perfs_TU << std::left <<std::setw(counter_description_width) << "Other operations" << separator << std::setw(time_per_step_width) << other << separator << std::setprecision(3) << std::setw(percent_loop_time_width) << fmt("%4.1f", other/(time_tl/nb_ts)*100) << separator <<std::endl;
1370 }
1371 if (max_virtual_swap_c>0)
1372 {
1373 if (message=="Time loop statistics")
1374 {
1375 if(nb_ts>0)
1376 perfs_TU << std::left <<std::setw(text_width) << "Number of virtual exchanges per time step:" << std::left <<std::setw(number_width) << max_virtual_swap_c/nb_ts << std::endl;
1377 else
1378 perfs_TU << std::left <<std::setw(text_width) << "Number of virtual exchanges" << std::left <<std::setw(number_width) << max_virtual_swap_c << std::endl;
1379 }
1380 else
1381 perfs_TU << std::left <<std::setw(text_width) << "Number of virtual exchanges:" << std::left <<std::setw(number_width) << max_virtual_swap_c << std::endl;
1382 }
1383 if (min_max_avg_sd_t_q_c_allreduce_comm[2][1]>0 && nb_ts>0)
1384 {
1385 double allreduce_per_ts = (double) min_max_avg_sd_t_q_c_allreduce_comm[2][1]/nb_ts;
1386 perfs_TU << std::left <<std::setw(text_width) << "Maximum number of MPI allreduce per time step" << std::left <<std::setw(number_width) << allreduce_per_ts << std::endl;
1387 if (allreduce_per_ts > 30.0 && message=="Time loop statistics")
1388 {
1389 perfs_TU << std::endl << line_sep_cpu << std::endl;
1390 perfs_TU << " Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations" << std::endl;
1391 perfs_TU << line_sep_cpu<< std::endl;
1392 }
1393 }
1394 int tmp = c_system_solver.count_;
1395 if (tmp > 0)
1396 {
1397 perfs_TU << std::endl;
1398 avg_solv_time = avg_solv_time / tmp;
1399 if (!(message=="Time loop statistics"))
1400 {
1401 if (nb_ts>0)
1402 perfs_TU << std::left <<std::setw(text_width) << "Number of calls to the linear solver per time step: " << std::left <<std::setw(number_width) << static_cast<double>(tmp) / nb_ts << std::endl;
1403 else
1404 perfs_TU << std::left <<std::setw(text_width) << "Number of call to the linear solver: " << std::left <<std::setw(number_width) << tmp << std::endl;
1405 perfs_TU << std::left <<std::setw(text_width) << "Average time of the resolution of the linear problem per call: " << std::left <<std::setw(number_width) << avg_solv_time << std::endl;
1406 }
1407 perfs_TU << std::left <<std::setw(text_width) << "Average number of iteration of the linear solver per call: " << std::left <<std::setw(number_width) << nb_it_per_solver_calls << std::endl << std::endl;
1408 }
1409 // GPU part of the TU :
1410 auto compute_percent_and_write_tabular_line = [& perfs_GPU, & nb_ts, & time_tl, &separator] (const Counter& c_, const std::string str)
1411 {
1412 double max_time = c_.time_alone_.count();
1413 double calls = c_.count_/nb_ts;
1414 double t_ts = max_time/nb_ts;
1415 double bw = max_time>0 ? static_cast<double>(c_.quantity_)/(1024.*1024.*1024*max_time) : 0.;
1416 double percent = 100*max_time/time_tl;
1417 perfs_GPU << std::left << std::setw(counter_description_width) << str <<separator << std::setw(time_per_step_width) << t_ts<<separator << std::setw(percent_loop_time_width) << fmt("%4.1f", percent) <<separator<< std::setw(count_per_ts_width) << calls <<separator;
1418 if (bw >1.0e-10)
1419 perfs_GPU << fmt("%3.1f GB/s", bw) << std::endl;
1420 //perfs_GPU << std::setw(bandwith_width) << bw << "GB/s" << std::endl;
1421 else
1422 perfs_GPU << std::endl;
1423 return percent;
1424 };
1425 if (copy_to_device_count>0 && nb_ts >0 && message=="Time loop statistics")
1426 {
1427 perfs_GPU << std::endl << line_sep_gpu << std::endl;
1428 spaces.assign((max_str_length_-14)/2,' ');
1429 perfs_GPU << spaces <<"GPU statistics" << std::endl;
1430 perfs_GPU << line_sep_gpu<<std::endl;
1431 perfs_GPU << std::left <<std::setw(counter_description_width) << "Counter description" << separator <<std::setw(time_per_step_width) << "Time per step" <<separator<< std::setw(percent_loop_time_width) << "% loop time" <<separator<< std::setw(count_per_ts_width) << "Call(s)/step" <<separator<< std::setw(bandwith_width)<< "Bandwidth"<<std::endl;
1432 perfs_GPU << line_sep_gpu << std::endl;
1433 double ratio_gpu_library = compute_percent_and_write_tabular_line(c_gpu_l,"Libraries: ");
1434 double ratio_gpu_kernel = compute_percent_and_write_tabular_line(c_gpu_k,"Kernels: ");
1435 double ratio_gpu = ratio_gpu_kernel+ratio_gpu_library;
1436 double ratio_copy = compute_percent_and_write_tabular_line(c_todevice,"Copy host to device: ");
1437 ratio_copy += compute_percent_and_write_tabular_line(c_fromdevice,"Copy device to host: ");
1438 double ratio_comm = 100.0 * (total_comm_time)/time_tl;
1439 double ratio_allocfree = compute_percent_and_write_tabular_line(c_allocfree,"Alloc/Free on device: ");
1440 double ratio_cpu = 100 * cpu_time/time_tl;
1441 // PL: I prefer this formulae:
1442 ratio_cpu = 100 - ratio_gpu - ratio_copy - ratio_allocfree - ratio_comm;
1443 perfs_GPU << std::setprecision(2) << "GPU: " << ratio_gpu << "% Copy H<->D: " << ratio_copy << "% Alloc/free: " << ratio_allocfree << "% Comm: "<< ratio_comm << "% CPU & I/O: " << ratio_cpu <<"%"<<std::endl;
1444 if (ratio_gpu<50)
1445 {
1446 Cerr << "==============================================================================================" << finl;
1447 Cerr << "[GPU] Warning: Only " << 0.1*int(10*ratio_gpu) << " % of the time calculation is spent on GPU." << finl;
1448 if (ratio_gpu_library==0)
1449 Cerr << "[GPU] First add a GPU solver !" << finl;
1450 else
1451 Cerr << "[GPU] Probably some algorithms used are not ported yet on GPU. Contact TRUST team." << finl;
1452 Cerr << "==============================================================================================" << finl;
1453 }
1454 }
1455 if (message=="Time loop statistics")
1456 {
1457 if (debit_seq>0 || debit_par>0)
1458 {
1459 perfs_IO << std::endl << line_sep_cpu << std::endl;
1460 spaces.assign((max_str_length_-message_width-4)/2,' ');
1461 perfs_IO << spaces << message <<": IO" << std::endl;
1462 perfs_IO << line_sep_cpu<<std::endl;
1463 }
1464 if (debit_seq>0)
1465 perfs_IO << std::left <<std::setw(text_width) << "Output write sequential: " << std::left <<std::setw(number_width) << debit_seq << "MB/s"<< std::endl;
1466 if (debit_par>0)
1467 perfs_IO << std::left <<std::setw(text_width) << "Output write parallel: " << std::left <<std::setw(number_width) << debit_par << "MB/s" << std::endl;
1468 if (total_nb_backup_>0)
1469 {
1470 perfs_IO << std::left <<std::setw(text_width) << "Total number of back-up: " << std::left <<std::setw(number_width) << total_nb_backup_ << std::endl;
1471 perfs_IO << std::left <<std::setw(text_width) << "Total amount of data per back-up: " << std::left <<std::setw(number_width) << total_data_exchange_per_backup_ << "MB"<< std::endl;
1472 }
1473 if(min_max_avg_sd_t_q_c_sendrecv_comm[2][1] > 0)
1474 {
1475 if(petcs_count>0)
1476 {
1477 perfs_IO<< std::endl<< "---------------------------------------------------------------------------------------------------------"<< std::endl;
1478 perfs_IO<< "Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated."<< std::endl;
1479 perfs_IO<< "---------------------------------------------------------------------------------------------------------"<< std::endl<< std::endl;
1480 }
1481 double fraction = 0.0;
1482 fraction = (comm_sendrecv_t + comm_allreduce_t)/ (total_time + MINFLOAT);
1483 fraction = 0.1 * floor(fraction * 1000);
1484 if (fraction > 100.)
1485 fraction = 100.;
1486 perfs_IO << std::left <<std::setw(text_width+10) << "Average of the fraction of the time spent in communications between processors: " << std::left <<std::setw(number_width) << fraction << "%" << std::endl;
1487 fraction = (min_max_avg_sd_t_q_c_sendrecv_comm[0][1] + min_max_avg_sd_t_q_c_allreduce_comm[0][1])/ (total_time_max + MINFLOAT);
1488 fraction = 0.1 * floor(fraction * 1000);
1489 if (fraction > 100.)
1490 fraction = 100.;
1491 perfs_IO << std::left <<std::setw(text_width+10) << "Max of the fraction of the time spent in communications between processors: " << std::left <<std::setw(number_width) << fraction << "%" << std::endl;
1492 fraction = (min_max_avg_sd_t_q_c_sendrecv_comm[0][0] + min_max_avg_sd_t_q_c_allreduce_comm[0][0])/ (total_time_max + MINFLOAT);
1493 fraction = 0.1 * floor(fraction * 1000);
1494 perfs_IO << std::left <<std::setw(text_width+10) << "Min of the fraction of the time spent in communications between processors: " << std::left <<std::setw(number_width) << fraction << "%" << std::endl;
1495 perfs_IO << std::left <<std::setw(text_width+10) << "Time of one mpsum measured by an internal bench over 0.1s (network latency): ";
1496 if (allreduce_peak_perf == 0.)
1497 perfs_IO << "not measured (total running time too short <10s)" << std::endl;
1498 else
1499 perfs_IO << std::left <<std::setw(number_width) << allreduce_peak_perf << std::endl;
1500 perfs_IO << std::left <<std::setw(text_width+10) << "Network maximum bandwidth on all processors: " << std::left <<std::setw(number_width) << fmt("%4.1f GB/s",max_bandwidth * 1.e-9) << std::endl ;
1501 if (nb_ts>0)
1502 perfs_IO << std::left <<std::setw(text_width+10) << "Total network traffic: " << std::left <<std::setw(number_width) << static_cast<double>(comm_sendrecv_q) * Process::nproc() / nb_ts * 1e-6 << "MB/time step" << std::endl;
1503 else
1504 perfs_IO << std::left <<std::setw(text_width+10) << "Total network traffic: " << std::left <<std::setw(number_width) << static_cast<double>(comm_sendrecv_q) * Process::nproc()* 1e-6 << "MB" << std::endl;
1505 perfs_IO << std::left <<std::setw(text_width+10) << "Average message size: " << std::left <<std::setw(number_width) << static_cast<double>(comm_sendrecv_q) / static_cast<double>(comm_sendrecv_c)* 1e-3 << "kB" << std::endl;
1506 perfs_IO << std::left <<std::setw(text_width+10) << "Min waiting time: " << std::left <<std::setw(number_width) << min_wait_fraction << "% of total time"<< std::endl;;
1507 perfs_IO << std::left <<std::setw(text_width+10) << "Max waiting time: " << std::left <<std::setw(number_width) << max_wait_fraction<< "% of total time"<< std::endl;;
1508 perfs_IO << std::left <<std::setw(text_width+10) << "Avg waiting time: " << std::left <<std::setw(number_width) << avg_wait_fraction<< "% of total time"<< std::endl;;
1509 }
1510 }
1511 // Concatenate stringtreams in order to print the .TU file
1512 Nom globalTU(Objet_U::nom_du_cas());
1513 globalTU +=".TU";
1514 auto printing_mode = ios::app;
1515 if (message=="Computation start-up statistics")
1516 printing_mode= ios::out;
1517 SFichier file(globalTU, printing_mode);
1518 file << file_header.str();
1519 file << perfs_TU.str();
1520 file << perfs_IO.str();
1521 file << perfs_GPU.str();
1522 file << captions.str();
1523 clean_stringstream(file_header);
1524 clean_stringstream(perfs_TU);
1525 clean_stringstream(perfs_GPU);
1526 clean_stringstream(perfs_IO);
1527 clean_stringstream(captions);
1528 }
1529}
1530
1531/////////////////////////////////////// Public methods of Pimpl ////////////////////////////////////////////
1532
1533void Perf_counters::Impl::create_custom_counter_impl(std::string counter_description , int counter_level, std::string counter_family , bool is_comm, bool is_gpu)
1534{
1535 if (counter_level <=0)
1536 Process::exit("Custom counters should not be set with a zero or negative level value");
1537 if (custom_counter_map_str_to_counter_.count(counter_description)==0)
1538 {
1539 auto result =custom_counter_map_str_to_counter_.emplace(counter_description, std::make_unique<Counter>(counter_level, counter_description, counter_family ,is_comm, is_gpu));
1540 if (!result.second)
1541 Process::exit("Failed to insert the new custom counter in the custom counter map");
1542 }
1543}
1544
1545void Perf_counters::Impl::begin_count_impl(const STD_COUNTERS& std_cnt, int counter_lvl)
1546{
1547 if (!counters_stop_)
1548 {
1549 Counter& c = get_counter(std_cnt);
1550 if (counter_lvl == -100000)
1551 counter_lvl = c.level_;
1552 time_point t = now();
1553 check_begin(c, counter_lvl,t);
1554 c.begin_count_(counter_lvl,t);
1555 }
1556}
1557
1558/*!
1559 *
1560 * @param custom_count_name key of the map (custom_counter_map_str_to_counter_) of the custom counter you try to close
1561 * @param counter_lvl
1562 */
1563void Perf_counters::Impl::begin_count_impl(const std::string& custom_count_name, int counter_lvl)
1564{
1565 if (!counters_stop_)
1566 {
1567 Counter& c = get_counter(custom_count_name);
1568 if (counter_lvl == -100000)
1569 counter_lvl = c.level_;
1570 time_point t = now();
1571 check_begin(c, counter_lvl,t);
1572 c.begin_count_(counter_lvl,t);
1573 }
1574}
1575
1576/*! @brief End the count of a counter and update the counter values
1577 *
1578 * @param c is the counter to end the count
1579 * @param count_increment is the count increment. If not specified, then it is equal to 1
1580 * @param quantity_increment is the increment of custom variable quantity. If not specified, it is set to 0.
1581 */
1582void Perf_counters::Impl::end_count_impl(const STD_COUNTERS& std_cnt, int count_increment, long int quantity_increment)
1583{
1584 if (!counters_stop_)
1585 {
1586 Counter& c = get_counter(std_cnt);
1587 time_point t = now();
1588 check_end(c, t);
1589 c.end_count_(count_increment, quantity_increment,t);
1590 if (c.level_ == -1)
1591 computation_time_ += c.total_time_;
1592 }
1593}
1594
1595
1596/*! @brief End the count of a counter and update the counter values
1597 *
1598 * @param c is the custom counter to end the count
1599 * @param count_increment is the count increment. If not specified, then it is equal to 1
1600 * @param quantity_increment is the increment of custom variable quantity. If not specified, it is set to 0.
1601 */
1602void Perf_counters::Impl::end_count_impl(const std::string& custom_count_name, int count_increment, long int quantity_increment)
1603{
1604 if (!counters_stop_)
1605 {
1606 Counter& c = get_counter(custom_count_name);
1607 time_point t = now();
1608 assert(custom_counter_map_str_to_counter_.count(custom_count_name) > 0);
1609 check_end(c, t);
1610 c.end_count_(count_increment, quantity_increment,t);
1611 }
1612}
1613
1614
1615/*! @brief Stop all counters, has to be called on every processor simultaneously
1616 *
1617 */
1619{
1620 time_point t_stop = now();
1621 duration time_elapsed_before_stop;
1622 if (counters_stop_)
1623 Process::exit("The counter are already stop, you can't stop them two times in a row \n");
1624 if (last_opened_counter_ != nullptr)
1625 {
1626 Counter* c = last_opened_counter_;
1627 c->time_alone_ += t_stop -c->last_open_time_alone_;
1629 while (c!=nullptr && c->level_ >= 0) // do not stop highest level counter
1630 {
1631 time_elapsed_before_stop= t_stop -c->last_open_time_;
1632 if (time_loop_ )
1633 c->time_ts_ += time_elapsed_before_stop;
1634 c->total_time_ += time_elapsed_before_stop;
1635 c = c->parent_;
1636 }
1637 }
1638 counters_stop_=true;
1639}
1640
1641/*! @brief Restart all counters, has to be called on every processor simultaneously
1642 *
1643 */
1645{
1646 time_point t_restart = now();
1647 if (counters_stop_==false)
1648 Process::exit("Try to restart counters but they have never been stopped before");
1649 if (last_opened_counter_ != nullptr)
1650 {
1651 Counter* c = last_opened_counter_;
1652 c->last_open_time_alone_ = t_restart;
1653 while (c !=nullptr && c->level_ >= 0) // do not touch top level counter
1654 {
1655 c->last_open_time_ = t_restart;
1656 if (time_loop_)
1657 c->open_time_ts_ = t_restart;
1658 c = c->parent_;
1659 }
1660 }
1661 counters_stop_=false;
1662}
1663
1665{
1666 // Reset all counters except the highest level one, hence starting the loop at 1:
1667 for (int i = 1 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1668 {
1669 Counter& c = *std_counters_[i];
1670 c.reset();
1671 }
1672 for (const auto & pair : custom_counter_map_str_to_counter_)
1673 {
1674 Counter& c = *pair.second;
1675 c.reset();
1676 }
1677}
1678
1680{
1681 nb_steps_elapsed_=time_step_elapsed;
1682 if (time_step_elapsed>0)
1683 end_cache_ =false;
1684}
1685
1687{
1688 if (counters_stop_)
1689 Process::exit("The counters are stop, you can't access the total time");
1690 Counter& c = get_counter(STD_COUNTERS::total_execution_time);
1691 if (c.is_running_)
1692 computation_time_+= now() - c.last_open_time_;
1693 return (computation_time_.count());
1694}
1695
1696double Perf_counters::Impl::get_total_time_impl(const STD_COUNTERS& name)
1697{
1698 Counter& c = get_counter(name);
1699 duration t = c.total_time_;
1700 if (c.is_running_)
1701 t += now() - c.last_open_time_;
1702 return (t.count());
1703}
1704
1705double Perf_counters::Impl::get_total_time_impl(const std::string& name)
1706{
1707 Counter& c = get_counter(name);
1708 duration t = c.total_time_;
1709 if (c.is_running_)
1710 t += now() - c.last_open_time_;
1711 return (t.count());
1712}
1713
1715{
1716 Counter& c = get_counter(name);
1717 duration t =duration::zero();
1718 if (!c.is_running_)
1719 Process::exit("The counter is not running: " + c.description_);
1720 t = now() - c.last_open_time_;
1721 return (t.count());
1722}
1723
1725{
1726 Counter& c = get_counter(name);
1727 duration t = duration::zero();
1728 if (!c.is_running_)
1729 Process::exit("The counter is not running: " + c.description_);
1730 t = now() - c.last_open_time_;
1731 return (t.count());
1732}
1733
1735{
1736 if (last_opened_counter_==nullptr)
1737 Process::exit("You are trying to start the time loop before the start-up");
1738 time_loop_=true;
1739}
1740
1742{
1743 if (!time_loop_)
1744 Process::exit("The time loop has not started, but you are trying to end it");
1745 time_loop_=false;
1746}
1747
1749{
1750 assert (last_opened_counter_!=nullptr);
1751 time_point t = now();
1752 if (last_opened_counter_ == nullptr)
1753 Process::exit("You are trying to start a time step outside the time loop");
1754
1755 for (int i =0 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1756 {
1757 Counter& c_std = *std_counters_[i];
1758 c_std.time_ts_=duration::zero();
1759 }
1760 for (const auto & pair : custom_counter_map_str_to_counter_)
1761 {
1762 Counter& c_c = *pair.second;
1763 c_c.time_ts_=duration::zero();
1764 }
1765 Counter* c = last_opened_counter_;
1766 while (c->parent_ != nullptr && c->level_>=0)
1767 {
1768 c->open_time_ts_ = t;
1769 c = c->parent_;
1770 }
1771}
1772
1773/*! @brief Compute for each counter open during a time step avg_time_per_step_, min_time_per_step_, max_time_per_step_ and sd_time_per_step_
1774 *
1775 * Called at the end of each time step, and only then
1776 * @param tstep number of time steps elapsed since the start of the computation
1777 *
1778 * Nota : if the counter is called before the time step loop, it is not accounted for in the computation.
1779 */
1781{
1782 stop_counters_impl(); ///< stop_counters already updated c->tim_ts_
1783 if (last_opened_counter_ == nullptr)
1784 Process::exit("You are trying to compute the statistics of a time steps but have not open any counter");
1785 if (!time_loop_)
1786 Process::exit("You are trying to compute time loop statistics outside of the time loop");
1787 double step = static_cast<double>(tstep) - static_cast<double>(nb_steps_elapsed_);
1788 auto compute = [step](Counter& c)
1789 {
1790 if (c.level_>=0 && step>0 && c.count_>0)
1791 {
1792 double t = (c.time_ts_).count();
1793 c.min_time_per_step_ = std::min(c.min_time_per_step_, t);
1794 c.max_time_per_step_ = std::max(c.max_time_per_step_, t);
1795 c.avg_time_per_step_ = ((step-1)*c.avg_time_per_step_ + t)/step;
1796 c.var_time_per_step_ += (t - c.avg_time_per_step_ )*(t - c.avg_time_per_step_ )/static_cast<double>(step);
1797 if (c.var_time_per_step_ < 0)
1798 c.var_time_per_step_ = 0;
1799 }
1801 };
1802 if (end_cache_)
1803 {
1804 for (int i =0 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1805 {
1806 Counter& c_std = *std_counters_[i];
1807 compute(c_std);
1808 }
1809 if (!custom_counter_map_str_to_counter_.empty())
1810 {
1811 for (const auto & pair : custom_counter_map_str_to_counter_)
1812 {
1813 Counter& c = *pair.second;
1814 compute(c);
1815 }
1816 }
1817 }
1818 else
1819 {
1820 end_cache_ = tstep >= nb_steps_elapsed_;
1821 if (end_cache_)
1822 {
1823 time_skipped_ts_ = get_counter(STD_COUNTERS::timeloop).total_time_;
1824 computation_time_ += get_counter(STD_COUNTERS::total_execution_time).total_time_;
1826 }
1827 }
1829}
1830
1832{
1833 if (last_opened_counter_==nullptr)
1834 return -5;
1835 return last_opened_counter_->level_;
1836}
1837
/*! @brief Stop the counters, write the TU outputs for the given message, then reset the highest level counter
 *
 * Calls print_global_TU() and print_performance_to_csv() with the message.
 *
 * @param message forwarded unchanged to print_global_TU() and print_performance_to_csv()
 */
1838void Perf_counters::Impl::print_TU_files_impl(const std::string& message)
1839{
// NOTE(review): listing line 1840 is absent here; the return below is presumably guarded by a condition
// (possibly the flag that disables the writing of the .TU files) - confirm against the real source.
1841 return;
1842 //Process::barrier();
1843 stop_counters_impl(); // will stop everything except highest level counter
1844
1845 // Also stop and update highest level counter
1846 Counter& c_time = get_counter(STD_COUNTERS::total_execution_time);
1847 auto time_elapsed_before_stop= now() - c_time.last_open_time_;
1848 c_time.total_time_ += time_elapsed_before_stop;
1849
// The total time of the highest level counter is added to the running computation time before printing
1850 computation_time_ += c_time.total_time_;
1851 print_global_TU(message);
1852 print_performance_to_csv(message);
// NOTE(review): listing line 1853 is absent here as well - confirm whether a statement is missing.
1854 // Also reset highest level counter:
1855 c_time.reset();
// The counters were flagged as stopped by stop_counters_impl(); clear the flag again
1856 counters_stop_=false;
1857}
1858
1860{
1861 if (gpu_timer_)
1862 Process::exit("You try to start the gpu timer and it is already running");
1863 gpu_timer_start_=now();
1864 gpu_timer_ = true;
1865}
1866
1868{
1869 if(!gpu_timer_)
1870 Process::exit("You try to stop the GPU timer, but it has not been started yet");
1871 gpu_timer_=false;
1872}
1873
1875{
1877 duration d= now() - gpu_timer_start_;
1878 return d.count();
1879}
1880
1882{
1883 return gpu_verbose_;
1884}
1886{
1887 gpu_verbose_ = on;
1888}
1890{
1891 return init_device_;
1892}
1894{
1895 init_device_=init;
1896}
1898{
1899 return gpu_timer_;
1900}
1902{
1903 gpu_timer_count_ += to_add;
1904}
1906{
1907 return gpu_timer_count_;
1908}
1909
1910//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1911// Methods of Perf_counters
1912//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
1913Perf_counters::Perf_counters() : pimpl_(std::make_unique<Impl>())
1914{
1915
1916}
1917
1918Perf_counters::~Perf_counters()=default;
1919
1920
1922{
1923 duration d= now() - start;
1924 {return d.count();}
1925}
1926
1927void Perf_counters::create_custom_counter(std::string counter_description , int counter_level, std::string counter_family , bool is_comm, bool is_gpu)
1928{
1929 pimpl_->create_custom_counter_impl(counter_description , counter_level, counter_family , is_comm, is_gpu);
1930}
1931
1932void Perf_counters::begin_count(const STD_COUNTERS& std_cnt, int counter_lvl)
1933{
1934 pimpl_->begin_count_impl(std_cnt,counter_lvl);
1935}
1936
1937void Perf_counters::begin_count(const std::string& custom_count_name, int counter_lvl)
1938{
1939 pimpl_->begin_count_impl(custom_count_name,counter_lvl);
1940}
1941
1942void Perf_counters::end_count(const std::string& custom_count_name, int count_increment, long int quantity_increment)
1943{
1944 pimpl_->end_count_impl(custom_count_name,count_increment,quantity_increment);
1945}
1946
1947void Perf_counters::end_count(const STD_COUNTERS& std_cnt, int count_increment, long int quantity_increment)
1948{
1949 pimpl_->end_count_impl(std_cnt,count_increment,quantity_increment);
1950}
1951
1953{
1954 pimpl_->stop_counters_impl();
1955}
1956
1958{
1959 pimpl_->restart_counters_impl();
1960}
1961
1963{
1964 pimpl_->reset_counters_impl();
1965}
1966
1967void Perf_counters::print_TU_files(const std::string& message)
1968{
1969 pimpl_->print_TU_files_impl(message);
1970}
1971
1973{
1974 return pimpl_->get_computation_time_impl();
1975}
1976
1977double Perf_counters::get_total_time(const STD_COUNTERS& name)
1978{
1979 return pimpl_->get_total_time_impl(name);
1980}
1981
1982double Perf_counters::get_total_time(const std::string& name)
1983{
1984 return pimpl_->get_total_time_impl(name);
1985}
1986
1987double Perf_counters::get_time_since_last_open(const STD_COUNTERS& name)
1988{
1989 return pimpl_->get_time_since_last_open_impl(name);
1990}
1991
1992double Perf_counters::get_time_since_last_open(const std::string& name)
1993{
1994 return pimpl_->get_time_since_last_open_impl(name);
1995}
1996
1997bool Perf_counters::is_running(const STD_COUNTERS& name)
1998{
1999 return pimpl_->running_impl(name);
2000}
2001
2003{
2004 pimpl_->start_timeloop_impl();
2005}
2006
2008{
2009 pimpl_->end_timeloop_impl();
2010}
2011
2013{
2014 pimpl_->start_time_step_impl();
2015}
2016
2018{
2019 pimpl_->end_time_step_impl(tstep);
2020}
2021
2023{
2024 pimpl_->set_time_steps_elapsed_impl(n);
2025}
2026
2028{
2029 return pimpl_->get_last_opened_counter_level_impl();
2030}
2031
2032void Perf_counters::record_nb_elem(trustIdType nb_elem)
2033{
2034 pimpl_->record_nb_elem_impl(nb_elem);
2035}
2036
2038{
2039 pimpl_->start_gpu_timer_impl();
2040}
2041
2043{
2044 pimpl_->stop_gpu_timer_impl();
2045}
2046
2048{
2049 return pimpl_->is_gpu_verbose_on_impl();
2050}
2051
2053{
2054 pimpl_->set_gpu_verbose_impl(on);
2055}
2056
2058{
2059 return pimpl_->get_init_device_impl();
2060}
2061
2063{
2064 pimpl_->set_init_device_impl(init);
2065}
2066
2068{
2069 return pimpl_->get_gpu_timer_impl();
2070}
2071
2073{
2074 pimpl_->add_to_gpu_timer_counter_impl(to_add);
2075}
2076
2078{
2079 return pimpl_->get_gpu_timer_counter_impl();
2080}
2081
2083{
2084 return pimpl_->compute_gpu_time_impl();
2085}
2086
2087bool Perf_counters::get_use_gpu() const {return pimpl_->get_use_gpu_impl();}
2088
2089bool Perf_counters::get_gpu_fence() const {return pimpl_->get_gpu_fence_impl();}
2090
2091void Perf_counters::set_gpu_fence(bool fence) {return pimpl_->set_gpu_fence_impl(fence);}
static bool disable_TU
Flag to disable or not the writing of the .TU files.
Definition Objet_U.h:125
static bool stat_per_proc_perf_log
Flag to enable the writing of the statistics detailed per processor in _csv.TU file.
Definition Objet_U.h:126
static const Nom & nom_du_cas()
Renvoie une reference constante vers le nom du cas.
Definition Objet_U.cpp:146
bool get_init_device_impl() const
void print_TU_files_impl(const std::string &message)
void restart_counters_impl()
Restart all counters, has to be called on every processor simultaneously.
bool is_gpu_verbose_on_impl() const
int get_last_opened_counter_level_impl() const
bool get_gpu_timer_impl() const
int get_gpu_timer_counter_impl() const
void stop_counters_impl()
Stop all counters, has to be called on every processor simultaneously.
void begin_count_impl(const STD_COUNTERS &std_cnt, int counter_lvl)
void set_gpu_fence_impl(bool fence)
void create_custom_counter_impl(std::string counter_description, int counter_level, std::string counter_family, bool is_comm, bool is_gpu)
void end_count_impl(const STD_COUNTERS &std_cnt, int count_increment, long int quantity_increment)
End the count of a counter and update the counter values.
bool get_gpu_fence_impl() const
double get_total_time_impl(const STD_COUNTERS &name)
void set_time_steps_elapsed_impl(int time_step_elapsed)
double get_time_since_last_open_impl(const STD_COUNTERS &name)
bool running_impl(const STD_COUNTERS name)
void record_nb_elem_impl(trustIdType nb_elem)
bool get_use_gpu_impl() const
void add_to_gpu_timer_counter_impl(int to_add=1)
void set_gpu_verbose_impl(bool on)
void end_time_step_impl(long int tstep)
Compute for each counter open during a time step avg_time_per_step_, min_time_per_step_,...
void set_init_device_impl(bool init)
std::chrono::time_point< clock > time_point
std::chrono::duration< double > duration
std::chrono::high_resolution_clock clock
int get_last_opened_counter_level() const
bool is_running(const STD_COUNTERS &name)
Check whether a counter is already running. Should rarely be used!
std::chrono::time_point< clock > time_point
int get_gpu_timer_counter() const
void end_time_step(long int tstep)
This function compute statistics per time steps of counters used at least once during a time step.
double stop_gpu_timer_and_compute_gpu_time()
void start_timeloop()
Set time_loop_ to true in order to account for cache properly.
Perf_counters(const Perf_counters &)=delete
void set_gpu_verbose(bool on)
void set_gpu_fence(bool fence)
time_point now()
bool get_gpu_timer() const
std::chrono::duration< double > duration
void record_nb_elem(trustIdType nb_elem)
void begin_count(const STD_COUNTERS &std_cnt, int counter_lvl=-100000)
void set_init_device(bool init)
bool get_gpu_fence() const
void set_nb_time_steps_elapsed(int n)
double get_time_since_last_open(const STD_COUNTERS &name)
Give as a double the time (in second) elapsed in the operation tracked by the standard counter call n...
void add_to_gpu_timer_counter(int to_add)
void stop_counters()
Stop all counters, has to be called on every processor simultaneously.
bool get_use_gpu() const
double get_total_time(const STD_COUNTERS &name)
Give as a double the total time (in second) elapsed in the operation tracked by the standard counter ...
bool get_init_device() const
void restart_counters()
Restart all counters, has to be called on every processor simultaneously.
void end_timeloop()
Set time_loop_ to false as we exit the time loop.
void reset_counters()
Reset counters to zero, used between the start-up of the computation, the computation itself and the ...
double compute_time(time_point start)
return time since start in seconds
void print_TU_files(const std::string &message)
Function that encapsulate the two functions that writes the TU files.
void end_count(const std::string &custom_count_name, int count_increment=1, long int quantity_increment=0)
End the count of a counter and update the counter values.
void create_custom_counter(std::string counter_description, int counter_level, std::string counter_family="None", bool is_comm=false, bool is_gpu=false)
Create a new counter and add it to the map of custom counters.
void start_time_step()
, this function start statistics tracking for a time step. It has to be called at the start of each t...
bool is_gpu_verbose_on() const
double get_computation_time()
Update computation_time_ and return its value as a double (in seconds).
static double mp_min(double)
Definition Process.cpp:386
static double mp_max(double)
Definition Process.cpp:376
static bool is_parallel()
Definition Process.cpp:110
static int nproc()
renvoie le nombre de processeurs dans le groupe courant Voir Comm_Group::nproc() et PE_Groups::curren...
Definition Process.cpp:104
static double mp_sum(double)
Calcule la somme de x sur tous les processeurs du groupe courant.
Definition Process.cpp:146
static void barrier()
Synchronise tous les processeurs du groupe courant (attend que tous les processeurs soient arrives a ...
Definition Process.cpp:136
static int me()
renvoie mon rang dans le groupe de communication courant.
Definition Process.cpp:125
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
Definition Process.cpp:455
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
Definition Process.cpp:86
std::string model
long int num_threads
std::chrono::duration< double > duration
double max_time_per_step_
double avg_time_per_step_
double var_time_per_step_
time_point last_open_time_alone_
bool running_() const
const bool is_comm_
Counter * parent_
void end_count_(int count_increment, long int quantity_increment, time_point t_stop)
time_point open_time_ts_
double min_time_per_step_
const bool is_gpu_
double get_time_() const
time_point last_open_time_
long int quantity_
std::array< std::array< double, 4 >, 4 > compute_min_max_avg_sd_() const
update variables : avg_time_per_step_ , min_time_per_step_ , max_time_per_step_ , sd_time_per_step_
Counter(int counter_level, std::string counter_name, std::string counter_family="None", bool is_comm=false, bool is_gpu=false)
const std::string description_
std::chrono::high_resolution_clock clock
const std::string family_
time_point now()
duration time_ts_
void begin_count_(int counter_level, time_point t)
void set_parent(Counter *parent_counter)
duration time_alone_
std::chrono::time_point< clock > time_point
duration total_time_
std::string name
std::string runtime_version
std::string driver_version