16#include <Perf_counters.h>
26#include <sys/utsname.h>
32#include <EntreeSortie.h>
34#include <EcrFicPartage.h>
37#include <TRUST_Version.h>
43#include <nvtx3/nvToolsExt.h>
44#include <cuda_runtime.h>
// CUDA flavour of the backend-neutral gpu* aliases: each gpu* name maps to the
// matching cuda* runtime symbol.
// VERSION_DIVISOR / VERSION_MOD are scaling constants applied to the integer
// version numbers returned by the runtime/driver version queries.
45#define gpuDeviceProp_t cudaDeviceProp
46#define gpuGetDevice cudaGetDevice
47#define gpuGetDeviceProperties cudaGetDeviceProperties
48#define gpuDriverGetVersion cudaDriverGetVersion
49#define gpuRuntimeGetVersion cudaRuntimeGetVersion
50#define VERSION_DIVISOR 1000
51#define VERSION_MOD 100
52#define GPU_SUCCESS cudaSuccess
55#include <rocprofiler-sdk-roctx/roctx.h>
56#include <hip/hip_runtime.h>
// HIP flavour of the backend-neutral gpu* aliases: each gpu* name maps to the
// matching hip* runtime symbol.
// VERSION_DIVISOR / VERSION_MOD are scaling constants applied to the integer
// version numbers returned by the runtime/driver version queries.
57#define gpuDeviceProp_t hipDeviceProp_t
58#define gpuGetDevice hipGetDevice
59#define gpuGetDeviceProperties hipGetDeviceProperties
60#define gpuDriverGetVersion hipDriverGetVersion
61#define gpuRuntimeGetVersion hipRuntimeGetVersion
62#define VERSION_DIVISOR 10000000
63#define VERSION_MOD 100000
64#define GPU_SUCCESS hipSuccess
// Tiny positive constant added to denominators in this file (bandwidth and
// wait-fraction computations) so that a zero elapsed time does not divide by zero.
66#define MINFLOAT 1.e-34
93 using clock = std::chrono::high_resolution_clock;
95 using duration = std::chrono::duration<double>;
101 Counter(
int counter_level, std::string counter_name, std::string counter_family =
"None",
bool is_comm =
false,
bool is_gpu =
false);
143Counter::Counter(
int counter_level, std::string counter_name, std::string counter_family ,
bool is_comm,
bool is_gpu)
162 if (counter_level !=
level_)
193 count_ += count_increment;
214 double qty,cnt,min,max,avg,sd ;
216 cnt =
static_cast<double>(
count_);
218 auto l_compute =[&min, &max,&avg,&sd] (
double value)
224 std::array<double,4> result = {min,max,avg,sd};
228 std::array<double,4> min_max_avg_sd_time = l_compute(
total_time_.count());
230 std::array<double,4> min_max_avg_sd_quantity = l_compute(qty);
232 std::array<double,4> min_max_avg_sd_count_ = l_compute(cnt);
234 std::array<double,4> min_max_avg_sd_time_alone_ = l_compute(
time_alone_.count());
236 return {min_max_avg_sd_time,min_max_avg_sd_quantity,min_max_avg_sd_count_,min_max_avg_sd_time_alone_ };
266 using clock = std::chrono::high_resolution_clock;
275 void create_custom_counter_impl(std::string counter_description,
int counter_level, std::string counter_family,
bool is_comm,
bool is_gpu);
277 void begin_count_impl(
const std::string& custom_count_name,
int counter_lvl);
278 void end_count_impl(
const STD_COUNTERS& std_cnt,
int count_increment,
long int quantity_increment);
279 void end_count_impl(
const std::string& custom_count_name,
int count_increment,
long int quantity_increment);
308 bool running_impl(
const STD_COUNTERS name) {
return get_counter(name).running_(); }
312 Counter& get_counter(
const STD_COUNTERS name) ;
313 Counter& get_counter(
const std::string name);
316 double compute_allreduce_peak();
317 std::string get_os()
const;
320 std::string get_date()
const;
321 void print_performance_to_csv(
const std::string& message);
322 void print_global_TU(
const std::string& message);
323 std::array<std::unique_ptr<Counter>,
static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER)> std_counters_;
324 std::map<std::string, std::unique_ptr<Counter>> custom_counter_map_str_to_counter_;
325 bool end_cache_=
false;
326 bool time_loop_=
false;
327 bool counters_stop_=
false;
329 duration computation_time_=duration::zero();
330 duration time_skipped_ts_=duration::zero();
331 Counter* last_opened_counter_=
nullptr;
333 int nb_steps_elapsed_=1;
335 int nb_steps_elapsed_=0;
337 int total_nb_backup_=0;
338 double total_data_exchange_per_backup_=0.;
339 bool gpu_verbose_ =
false;
340 bool init_device_ =
false;
341 bool gpu_timer_ =
false;
343 bool gpu_fence_=
true;
345 int gpu_timer_count_=0;
346 int max_str_length_=118;
347 trustIdType nb_elem_tot_=0;
354 std_counters_[
static_cast<int>(STD_COUNTERS::total_execution_time)] = std::make_unique<Counter>(-1,
"Total time");
355 std_counters_[
static_cast<int>(STD_COUNTERS::computation_start_up)] = std::make_unique<Counter>(0,
"Computation start-up");
356 std_counters_[
static_cast<int>(STD_COUNTERS::timeloop)] = std::make_unique<Counter>(0,
"Time loop");
357 std_counters_[
static_cast<int>(STD_COUNTERS::backup_file)] = std::make_unique<Counter>(0,
"Back-up operations");
358 std_counters_[
static_cast<int>(STD_COUNTERS::system_solver)] = std::make_unique<Counter>(1,
"Linear solver resolutions Ax=B");
359 std_counters_[
static_cast<int>(STD_COUNTERS::petsc_solver)] = std::make_unique<Counter>(2,
"Petsc solver");
360 std_counters_[
static_cast<int>(STD_COUNTERS::matrix_assembly)] = std::make_unique<Counter>(1,
"Matrix assembly for implicit scheme");
361 std_counters_[
static_cast<int>(STD_COUNTERS::ajouter_blocs)] = std::make_unique<Counter>(1,
"Call to ::ajouter_blocs for matrix assembly");
362 std_counters_[
static_cast<int>(STD_COUNTERS::implicit_diffusion)] = std::make_unique<Counter>(1,
"Solver for implicit diffusion");
363 std_counters_[
static_cast<int>(STD_COUNTERS::compute_dt)] = std::make_unique<Counter>(1,
"Computation of the time step dt");
364 std_counters_[
static_cast<int>(STD_COUNTERS::turbulent_viscosity)] = std::make_unique<Counter>(1,
"Turbulence model::update");
365 std_counters_[
static_cast<int>(STD_COUNTERS::convection)] = std::make_unique<Counter>(1,
"Convection operator");
366 std_counters_[
static_cast<int>(STD_COUNTERS::diffusion)] = std::make_unique<Counter>(1,
"Diffusion operator");
367 std_counters_[
static_cast<int>(STD_COUNTERS::gradient)] = std::make_unique<Counter>(1,
"Gradient operator");
368 std_counters_[
static_cast<int>(STD_COUNTERS::divergence)] = std::make_unique<Counter>(1,
"Divergence operator");
369 std_counters_[
static_cast<int>(STD_COUNTERS::source_terms)] = std::make_unique<Counter>(1,
"Source terms");
370 std_counters_[
static_cast<int>(STD_COUNTERS::postreatment)] = std::make_unique<Counter>(1,
"Post-treatment operations");
371 std_counters_[
static_cast<int>(STD_COUNTERS::restart)] = std::make_unique<Counter>(1,
"Read file for restart");
372 std_counters_[
static_cast<int>(STD_COUNTERS::update_variables)] = std::make_unique<Counter>(1,
"Update ::mettre_a_jour");
374 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_sendrecv)] = std::make_unique<Counter>(2,
"MPI_send_recv",
"MPI_sendrecv",
true);
375 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_send)] = std::make_unique<Counter>(2,
"MPI_send",
"MPI_sendrecv",
true);
376 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_recv)] = std::make_unique<Counter>(2,
"MPI_recv",
"MPI_sendrecv",
true);
377 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_bcast)] = std::make_unique<Counter>(2,
"MPI_broadcast",
"MPI_sendrecv",
true);
378 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_alltoall)] = std::make_unique<Counter>(2,
"MPI_alltoall",
"MPI_sendrecv",
true);
379 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_allgather)] = std::make_unique<Counter>(2,
"MPI_allgather",
"MPI_sendrecv",
true);
380 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_gather)] = std::make_unique<Counter>(2,
"MPI_gather",
"MPI_sendrecv",
true);
381 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_partialsum)] = std::make_unique<Counter>(2,
"MPI_partialsum",
"MPI_allreduce",
true);
382 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_sumdouble)] = std::make_unique<Counter>(2,
"MPI_sumdouble",
"MPI_allreduce",
true);
383 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_mindouble)] = std::make_unique<Counter>(2,
"MPI_mindouble",
"MPI_allreduce",
true);
384 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_maxdouble)] = std::make_unique<Counter>(2,
"MPI_maxdouble",
"MPI_allreduce",
true);
385 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_sumfloat)] = std::make_unique<Counter>(2,
"MPI_sumfloat",
"MPI_allreduce",
true);
386 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_minfloat)] = std::make_unique<Counter>(2,
"MPI_minfloat",
"MPI_allreduce",
true);
387 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_maxfloat)] = std::make_unique<Counter>(2,
"MPI_maxfloat",
"MPI_allreduce",
true);
388 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_sumint)] = std::make_unique<Counter>(2,
"MPI_sumint",
"MPI_allreduce",
true);
389 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_minint)] = std::make_unique<Counter>(2,
"MPI_minint",
"MPI_allreduce",
true);
390 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_maxint)] = std::make_unique<Counter>(2,
"MPI_maxint",
"MPI_allreduce",
true);
391 std_counters_[
static_cast<int>(STD_COUNTERS::mpi_barrier)] = std::make_unique<Counter>(2,
"MPI_barrier",
"MPI_allreduce",
true);
394 std_counters_[
static_cast<int>(STD_COUNTERS::gpu_library)] = std::make_unique<Counter>(2,
"GPU_library",
"GPU_library",
false,
true);
395 std_counters_[
static_cast<int>(STD_COUNTERS::gpu_kernel)] = std::make_unique<Counter>(2,
"GPU_kernel",
"GPU_kernel",
false,
true);
396 std_counters_[
static_cast<int>(STD_COUNTERS::gpu_copytodevice)] = std::make_unique<Counter>(2,
"GPU_copyToDevice",
"GPU_copy",
false,
true);
397 std_counters_[
static_cast<int>(STD_COUNTERS::gpu_copyfromdevice)] = std::make_unique<Counter>(2,
"GPU_copyFromDevice",
"GPU_copy",
false,
true);
398 std_counters_[
static_cast<int>(STD_COUNTERS::gpu_malloc_free)] = std::make_unique<Counter>(2,
"GPU_allocations" ,
"GPU_alloc",
false,
true);
400 std_counters_[
static_cast<int>(STD_COUNTERS::interprete_scatter)] = std::make_unique<Counter>(2,
"Scatter_interprete",
"None",
true,
false);
401 std_counters_[
static_cast<int>(STD_COUNTERS::virtual_swap)] = std::make_unique<Counter>(2,
"DoubleVect/IntVect::virtual_swap",
"None",
true);
402 std_counters_[
static_cast<int>(STD_COUNTERS::read_scatter)] = std::make_unique<Counter>(2,
"Scatter::read_domaine",
"None",
true);
404 std_counters_[
static_cast<int>(STD_COUNTERS::parallel_meshing)] = std::make_unique<Counter>(0,
"Parallel meshing");
406 std_counters_[
static_cast<int>(STD_COUNTERS::IO_EcrireFicPartageBin)] = std::make_unique<Counter>(2,
"write",
"IO");
407 std_counters_[
static_cast<int>(STD_COUNTERS::IO_EcrireFicPartageMPIIO)] = std::make_unique<Counter>(2,
"MPI_File_write_all",
"IO");
408 if (nb_steps_elapsed_==0)
// Returns the standard counter stored in std_counters_ at the index given by
// the enum value. The pointer is dereferenced without a null check.
416Counter& Perf_counters::Impl::get_counter(
const STD_COUNTERS name)
418 return *std_counters_[
static_cast<int>(name)];
// Looks up a custom counter by its description string.
// Calls Process::exit() with an error message when no counter is registered
// under that description; otherwise returns the mapped counter.
421Counter& Perf_counters::Impl::get_counter(std::string cust_counter_desc)
423 if (custom_counter_map_str_to_counter_.count(cust_counter_desc)==0)
424 Process::exit(
"You are trying to find a custom counter that does not exists");
// NOTE(review): the mapped unique_ptr is dereferenced without a null check —
// confirm the map never stores empty pointers.
425 return *custom_counter_map_str_to_counter_.at(cust_counter_desc);
428void Perf_counters::Impl::check_begin(Counter& c,
int counter_lvl,
time_point t)
430 if (last_opened_counter_ !=
nullptr)
440 int expected_lvl = last_opened_counter_->level_ +1;
443 counter_lvl=expected_lvl;
445 if (counter_lvl != expected_lvl)
447 std::ostringstream error_msg ;
448 error_msg <<
"The counter you are trying to start does not have the expected level, counter running: " << last_opened_counter_->description_ <<
" counter that you try to open: " << c.
description_ <<
" ; expected level: "<< expected_lvl << std::endl ;
457 last_opened_counter_ =&c;
460void Perf_counters::Impl::check_end(Counter& c,
time_point t)
462 if (!c.
is_running_ || last_opened_counter_==
nullptr)
464 if (last_opened_counter_ != &c)
466 std::string error_msg =
"The counter you are trying to close is not the last opened, counter: " + c.
description_;
474 last_opened_counter_ = c.
parent_;
481double Perf_counters::Impl::compute_allreduce_peak()
486 for (i = 0; i < 100; i++)
490 double allreduce_peak_perf = time.count();
495std::string delete_blank_spaces(std::string str)
// Builds a one-line description of the host from uname(): node name, system
// name, machine, release and version, joined with "__".
// The string is passed through delete_blank_spaces() and truncated to
// max_str_length_ characters. A fixed error message is returned when uname() fails.
527std::string Perf_counters::Impl::get_os()
const
530 struct utsname buffer;
531 if (uname(&buffer) == -1)
532 return "Error: Unable to retrieve OS info";
533 result += std::string(buffer.nodename) +
"__";
534 result += std::string(buffer.sysname) +
"__";
535 result += std::string(buffer.machine) +
"__";
536 result += std::string(buffer.release)+
"__";
537 result += std::string(buffer.version);
538 result = delete_blank_spaces(result);
// Keep at most max_str_length_ characters.
539 return result.substr(0,max_str_length_);
// Collects CPU information: the thread count reported by
// std::thread::hardware_concurrency() and a model string chosen per platform.
549CPUInfo Perf_counters::Impl::get_cpu()
const
552 info.
num_threads = std::thread::hardware_concurrency();
// Apple and Cygwin builds use a fixed label instead of querying the hardware.
553#if defined(__APPLE__)
554 info.
model =
"Apple";
556#elif defined(__CYGWIN__)
557 info.
model =
"Cygwin";
559#elif defined(__linux__)
// On Linux the model is parsed from a "model name" line of /proc/cpuinfo.
560 std::ifstream cpuinfo(
"/proc/cpuinfo");
564 while (std::getline(cpuinfo, line))
566 if (line.find(
"model name") != std::string::npos)
568 size_t pos = line.find(
':');
569 if (pos != std::string::npos)
// Skip the ':' and the character that follows it (presumably a space).
571 info.
model = line.substr(pos + 2);
// Fallback label when no model string was extracted.
577 if (info.
model.empty())
579 info.
model =
"Unknown Linux CPU";
// NOTE(review): presumably the branch for any other platform — the
// corresponding preprocessor directive is not visible here.
582 info.
model =
"Unknown CPU";
591GPUInfo Perf_counters::Impl::get_gpu()
const
596#if !(defined(TRUST_USE_ROCM) || defined(TRUST_USE_CUDA))
597#error "Neither CUDA nor HIP macros defined, but TRUST_USE_GPU is defined! Something's wrong."
// Value-initialise the query outputs: the runtime queries that fill them only
// log on failure, so they must not hold indeterminate values when they are
// read afterwards to build the GPU description.
600 gpuDeviceProp_t prop{};
602 int driverVersion = 0, runtimeVersion = 0;
// Query the current device, its properties and the driver version.
// A failing query is only reported on Cerr; the statements that follow are
// still executed.
604 auto err1=gpuGetDevice(&device);
605 if(err1!=GPU_SUCCESS)
606 Cerr<<
"Failed to get GPU device model"<<std::endl;
607 auto err2=gpuGetDeviceProperties(&prop, device);
608 if(err2!=GPU_SUCCESS)
609 Cerr<<
"Failed to get GPU device properties"<<std::endl;
610 auto err3=gpuDriverGetVersion(&driverVersion);
611 if(err3!=GPU_SUCCESS)
612 Cerr<<
"Failed to get GPU driver version"<<std::endl;
// Query the runtime version and report a failure on Cerr, using the same
// "!= GPU_SUCCESS" test as the other GPU queries of this function.
613 auto err4=gpuRuntimeGetVersion(&runtimeVersion);
614 if(err4!=GPU_SUCCESS)
615 Cerr<<
"Failed to get GPU runtime version"<<std::endl;
617 info.
name = std::string(prop.name);
// Format the runtime version as "major.minor". Both backends store the minor
// number in the two decimal digits just below VERSION_DIVISOR
// (CUDA: 1000*major + 10*minor; HIP: 10000000*major + 100000*minor + patch),
// so it is extracted with VERSION_DIVISOR only.
619 std::ostringstream runtime_stream;
620 runtime_stream << (runtimeVersion / VERSION_DIVISOR) <<
"."
621 << ((runtimeVersion % VERSION_DIVISOR) / (VERSION_DIVISOR / 100));
// Format the driver version as "major.minor". Both backends store the minor
// number in the two decimal digits just below VERSION_DIVISOR
// (CUDA: 1000*major + 10*minor; HIP: 10000000*major + 100000*minor + patch),
// so it is extracted with VERSION_DIVISOR only.
624 std::ostringstream driver_stream;
625 driver_stream << (driverVersion / VERSION_DIVISOR) <<
"."
626 << ((driverVersion % VERSION_DIVISOR) / (VERSION_DIVISOR / 100));
// Returns the current local date and time formatted with the pattern
// "%d-%m-%Y -- %X", passed through delete_blank_spaces() and truncated to
// max_str_length_ characters.
636std::string Perf_counters::Impl::get_date()
const
638 time_t
now = time(0);
639 std::ostringstream date;
// NOTE(review): the pointer returned by localtime() is dereferenced without a
// null check, and localtime() may use shared static storage — confirm this is
// only called from a single thread.
640 struct tm tstruct = *localtime(&
now);
641 date<< std::put_time(&tstruct,
"%d-%m-%Y -- %X");
642 std::string result = date.str();
643 result = delete_blank_spaces(result);
644 return (result.substr(0,max_str_length_));
647static void build_line_csv(std::ostringstream& lines,
const std::array<std::string,24>& line_items,
const std::array<int,24>& item_size)
649 int size_of_str_to_add = 50;
650 long long int len_line = line_items.size();
651 for (
long long int i=0 ; i<len_line ; i++)
653 size_of_str_to_add = item_size[i];
654 lines << std::setw(size_of_str_to_add) ;
655 lines << line_items[i];
656 if (i == len_line -1)
663void clean_stringstream(std::ostringstream& lines)
669void Perf_counters::Impl::print_performance_to_csv(
const std::string& message)
671 assert(!message.empty());
672 std::ostringstream perfs;
673 std::ostringstream perfs_globales;
674 std::ostringstream file_header;
676 const int length_line = 24;
677 std::array<int,length_line> item_size;
678 for (
int& j:item_size)
680 std::array<std::string,length_line> line_items;
681 for (std::string& str :line_items)
683 std::ostringstream tmp_item;
693 CPUInfo cpu = get_cpu();
694 file_header <<
"# Detailed performance log file for case: " <<
Objet_U::nom_du_cas()<<
". See the associated validation form for an example of data analysis"<< std::endl;
695 file_header <<
"# Date of the computation: " << get_date() << std::endl;
696 file_header <<
"# OS used: " << get_os() << std::endl;
697 file_header <<
"# CPU model: " << cpu.
model << std::endl;
698 file_header <<
"# Total number of threads:" << cpu.
num_threads << std::endl;
701 GPUInfo gpu = get_gpu();
702 file_header <<
"# GPU model: " << gpu.
name << std::endl;
704 file_header <<
"# CUDA runtime version: " << gpu.
runtime_version << std::endl;
705 file_header <<
"# CUDA drivers version: " << gpu.
driver_version << std::endl;
708 file_header <<
"# HIP runtime version: " << gpu.
runtime_version << std::endl;
709 file_header <<
"# HIP drivers version: " << gpu.
driver_version << std::endl;
713 file_header <<
"# GPU model: "<<
"No GPU used for the computation" << std::endl;
714 file_header <<
"# Number of processor used = " << nb_procs << std::endl;
715 file_header <<
"# Total number of elements used for the calculation: " << nb_elem_tot << std::endl;
716 file_header <<
"# The time was measured by the following method using std::chrono::high_resolution_clock::now() and is printed in seconds" << std::endl ;
717 file_header <<
"# By default, only averaged statistics on all processor are printed. For accessing the detail per processor, add 'stat_per_proc_perf_log 1' in the data file"<< std::endl;
718 file_header <<
"# Processor number equal to -1 corresponds to the performance of the calculation averaged on the processors during the simulation step" << std::endl;
719 file_header <<
"# If a counter does not belong in any particular family, then counter family is set to None" << std::endl;
720 file_header <<
"# Count means the number of time the counter is called during the overall calculation step." << std::endl;
721 file_header <<
"# Min, max and SD accounts respectively for the minimum, maximum and Standard Deviation of the quantity of the previous row." << std::endl;
722 file_header <<
"# Quantity is a custom variable that depends on the counter. It is used to compute bandwidth for communication counters for example. See the table at the end of the introduction on statistics in TRUST form for more details." << std::endl;
723 file_header <<
"# To retrieve the time not tracked by any counter of level 1 or higher, sum the 'time alone' value of counters of level -1 and 0." << std::endl;
724 file_header <<
"#" << std::endl <<
"#" << std::endl;
726 line_items[0] =
"Overall_simulation_step";
727 line_items[1] =
"Processor_Number";
728 line_items[2] =
"Counter_family";
729 line_items[3] =
"Counter_name";
730 line_items[4] =
"Counter_level";
731 line_items[5] =
"Is_comm";
732 line_items[6] =
"%_total_time";
733 line_items[7] =
"total time";
734 line_items[8] =
"t_min";
735 line_items[9] =
"t_max";
736 line_items[10] =
"t_SD";
737 line_items[11] =
"time alone";
738 line_items[12] =
"t_alone_min";
739 line_items[13] =
"t_alone_max";
740 line_items[14] =
"t_alone_SD";
741 line_items[15] =
"count";
742 line_items[16] =
"time_per_step";
743 line_items[17] =
"tps_min";
744 line_items[18] =
"tps_max";
745 line_items[19] =
"tps_SD";
746 line_items[20] =
"Quantity";
747 line_items[21] =
"q_min";
748 line_items[22] =
"q_max";
749 line_items[23] =
"q_SD";
751 assert(item_size.size()==length_line);
752 assert(line_items.size()==item_size.size());
754 build_line_csv(file_header,line_items,item_size);
758 bool skip_globals =
false;
759 int total_nb_of_counters =
static_cast<int>(std_counters_.size()) +
static_cast<int>(custom_counter_map_str_to_counter_.size());
760 int min_total_nb_of_counters = total_nb_of_counters;
761 int max_total_nb_of_counters = total_nb_of_counters;
763 if ( (max_total_nb_of_counters - min_total_nb_of_counters)!=0 )
767 perfs_globales <<
"Unable to collect statistics :" << std::endl
768 <<
" there is not the same number of counters on all"
769 " processors."<< std::endl;
773 Counter& c_time = get_counter(STD_COUNTERS::timeloop);
774 int nb_ts = c_time.
count_- nb_steps_elapsed_;
776 if (time_loop_ && nb_ts <= 0)
780 perfs_globales <<
"The computation is shorter than cache" << std::endl;
788 long int quantity, min_quantity=0, max_quantity=0;
789 double time,time_alone,min_time_alone=0.,max_time_alone=0.,SD_time_alone=0.0;
790 double percent_time=0., min_time=0.0, max_time=0.0;
791 double SD_time=0.0, SD_quantity=0.0;
792 double avg_time_per_step=0., min_time_per_step=0., max_time_per_step=0., sd_time_per_step=0.;
794 auto fill_items = [&](
int proc_number,
const std::string desc,
const std::string familly)
797 line_items[0] = tmp_item.str();
801 tmp_item<< proc_number;
802 line_items[1] = tmp_item.str();
806 line_items[2] = tmp_item.str();
810 line_items[3] = tmp_item.str();
814 line_items[4] = tmp_item.str();
818 line_items[5] = tmp_item.str();
821 tmp_item<< std::setprecision(4);
822 tmp_item<< percent_time;
823 line_items[6] = tmp_item.str();
826 tmp_item << std::scientific << std::setprecision(7);
828 line_items[7] = tmp_item.str();
832 line_items[8] = tmp_item.str();
836 line_items[9] = tmp_item.str();
840 line_items[10] = tmp_item.str();
843 tmp_item<< time_alone;
844 line_items[11] = tmp_item.str();
847 tmp_item<< min_time_alone;
848 line_items[12] = tmp_item.str();
851 tmp_item<< max_time_alone;
852 line_items[13] = tmp_item.str();
855 tmp_item<< SD_time_alone;
856 line_items[14] = tmp_item.str();
860 line_items[15] = tmp_item.str();
863 tmp_item<< avg_time_per_step;
864 line_items[16] = tmp_item.str();
867 tmp_item<< min_time_per_step;
868 line_items[17] = tmp_item.str();
871 tmp_item<< max_time_per_step;
872 line_items[18] = tmp_item.str();
// sd_time_per_step already holds a standard deviation (extract_stats assigns
// it the square root of the counter's variance), so it is written as-is;
// taking the square root a second time would print the wrong value.
875 tmp_item<< sd_time_per_step;
876 line_items[19] = tmp_item.str();
880 line_items[20] = tmp_item.str();
883 tmp_item<< min_quantity;
884 line_items[21] = tmp_item.str();
887 tmp_item<< max_quantity;
888 line_items[22] = tmp_item.str();
891 tmp_item<< SD_quantity;
892 line_items[23] = tmp_item.str();
896 auto extract_stats = [&](
const Counter & c_lambda)
898 level = c_lambda.level_;
899 is_comm = c_lambda.is_comm_;
900 time = c_lambda.total_time_.count();
901 time_alone = c_lambda.time_alone_.count();
902 count = c_lambda.count_;
903 quantity = c_lambda.quantity_;
904 avg_time_per_step = c_lambda.avg_time_per_step_;
905 min_time_per_step = c_lambda.min_time_per_step_;
906 max_time_per_step = c_lambda.max_time_per_step_;
907 sd_time_per_step = std::sqrt(std::max(0., c_lambda.var_time_per_step_));
919 fill_items(
Process::me(),c_lambda.description_, c_lambda.family_);
920 build_line_csv(perfs,line_items,item_size);
924 std::array< std::array<double,4> ,4> table = c_lambda.compute_min_max_avg_sd_();
928 min_time = table[0][0];
929 max_time = table[0][1];
930 SD_time = table[0][3];
931 quantity =
static_cast <long int>(std::floor(table[1][2]));
932 min_quantity =
static_cast <long int>(std::floor(table[1][0]));
933 max_quantity =
static_cast <long int>(std::floor(table[1][1]));
934 SD_quantity = table[1][3];
935 time_alone = table[3][2];
936 min_time_alone = table[3][0];
937 max_time_alone = table[3][1];
938 SD_time_alone = table[3][3];
941 fill_items(-1,c_lambda.description_,c_lambda.family_);
942 build_line_csv(perfs_globales,line_items,item_size);
948 for (
int i =0 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
950 Counter& c_std = *std_counters_[i];
951 extract_stats(c_std);
954 for (
const auto & pair : custom_counter_map_str_to_counter_)
956 if (pair.second!=
nullptr)
957 extract_stats(*pair.second);
962 auto printing_mode = ios::app;
963 if (message==
"Computation start-up statistics")
964 printing_mode= ios::out;
967 SFichier file(CSV, printing_mode);
968 file << file_header.str();
969 file << perfs_globales.str();
976 EcrFicPartage file(CSV,ios::app);
980 clean_stringstream(file_header);
981 clean_stringstream(perfs);
982 clean_stringstream(perfs_globales);
994inline std::array< std::array<double,4> ,3> compute_min_max_avg_sd(
double& time,
long int& quantity,
int& count)
996 double qty,cnt,min,max,avg,sd ;
997 qty=
static_cast<double>(quantity);
998 cnt =
static_cast<double>(count);
1001 auto l_compute =[&min, &max,&avg,&sd] (
double value)
1007 std::array<double,4> result = {min,max,avg,sd};
1010 std::array<double,4> min_max_avg_sd_time = l_compute(time);
1012 std::array<double,4> min_max_avg_sd_quantity = l_compute(qty);
1013 std::array<double,4> min_max_avg_sd_count = l_compute(cnt);
1014 count =
static_cast<int>(std::floor (avg));
1015 return {min_max_avg_sd_time,min_max_avg_sd_quantity,min_max_avg_sd_count};
// Fallback without reduction: the arrays are ordered {min, max, avg, sd}, so
// the local value is its own minimum, maximum and average, and the standard
// deviation is 0.
// NOTE(review): assumes this branch is the single-process path — the branch
// condition is not visible here; confirm.
1019 std::array<double,4> min_max_avg_sd_time = {time,time,time,0.};
1020 std::array<double,4> min_max_avg_sd_quantity = {qty,qty,qty,0.};
1021 std::array<double,4> min_max_avg_sd_count = {cnt,cnt,cnt,0.};
1022 return {min_max_avg_sd_time,min_max_avg_sd_quantity,min_max_avg_sd_count};
1026template <
typename... Args>
1027std::string fmt(
const char* format, Args... args)
1030 std::snprintf(buf,
sizeof(buf), format, args...);
1031 return std::string(buf);
1039void Perf_counters::Impl::print_global_TU(
const std::string& message)
1041 assert(!message.empty());
1042 std::ostringstream perfs_TU;
1043 std::ostringstream perfs_GPU;
1044 std::ostringstream perfs_IO;
1045 std::ostringstream captions;
1046 std::ostringstream file_header;
1047 const int counter_description_width = 40;
1048 const int time_per_step_width= 15;
1049 const int percent_loop_time_width=11;
1050 const int count_per_ts_width=15;
1051 const int level_width=5;
1052 const int bandwith_width= 10;
1053 const int tabular_custom_line_width= counter_description_width+3+time_per_step_width+3+percent_loop_time_width+3+count_per_ts_width+3+level_width;
1054 const int cpu_line_width=counter_description_width+3+time_per_step_width+3+percent_loop_time_width+3+count_per_ts_width;
1055 const int gpu_line_width=counter_description_width+3+time_per_step_width+3+percent_loop_time_width+3+count_per_ts_width+3+bandwith_width+4;
1056 const int number_width=15;
1057 const int text_width =cpu_line_width-count_per_ts_width;
1058 const int header_txt_width = 10;
1059 const int message_width =
static_cast<int>(message.length());
1060 const std::string separator =
" | ";
1061 const std::string line_sep_cpu(max_str_length_,
'~');
1062 const std::string line_sep_tabular(cpu_line_width,
'-');
1063 const std::string line_sep_tabular_custom(tabular_custom_line_width,
'-');
1064 const std::string line_sep_gpu(gpu_line_width,
'-');
1066 double allreduce_peak_perf = compute_allreduce_peak();
1067 double comm_allreduce_t = 0.0, comm_sendrecv_t = 0.0;
// Accumulated quantities of the all-reduce and send/recv counter families;
// integer accumulators are initialised with integer literals.
1068 long int comm_allreduce_q = 0,comm_sendrecv_q = 0;
1069 int comm_allreduce_c = 0,comm_sendrecv_c = 0;
1070 std::array< std::array<double,4> ,3> min_max_avg_sd_t_q_c_sendrecv_comm ;
1071 for (std::array<double,4>& arr: min_max_avg_sd_t_q_c_sendrecv_comm)
1072 for (
double & d : arr)
1074 std::array< std::array<double,4> ,3> min_max_avg_sd_t_q_c_allreduce_comm = min_max_avg_sd_t_q_c_sendrecv_comm ;
1075 Counter& c_timeloop = get_counter(STD_COUNTERS::timeloop);
1076 int nb_ts = c_timeloop.
count_;
1077 nb_ts = std::max(nb_ts,1);
1079 Counter& c_total_time = get_counter(STD_COUNTERS::total_execution_time);
1080 Counter& c_mpi_sendrecv = get_counter(STD_COUNTERS::mpi_sendrecv);
1081 Counter& c_virtual_swap = get_counter(STD_COUNTERS::virtual_swap);
1082 Counter& c_system_solver= get_counter(STD_COUNTERS::system_solver);
1083 Counter& c_backup = get_counter(STD_COUNTERS::backup_file);
1084 Counter& c_todevice = get_counter(STD_COUNTERS::gpu_copytodevice);
1085 Counter& c_gpu_l = get_counter(STD_COUNTERS::gpu_library);
1086 Counter& c_gpu_k = get_counter(STD_COUNTERS::gpu_kernel);
1087 Counter& c_fromdevice = get_counter(STD_COUNTERS::gpu_copyfromdevice);
1088 Counter& c_io_seq = get_counter(STD_COUNTERS::IO_EcrireFicPartageBin);
1089 Counter& c_io_par = get_counter(STD_COUNTERS::IO_EcrireFicPartageMPIIO);
1090 Counter& c_petsc=get_counter(STD_COUNTERS::petsc_solver);
1091 Counter& c_allocfree=get_counter(STD_COUNTERS::gpu_malloc_free);
1096 double total_time = c_total_time.
total_time_.count();
1100 double total_comm_time=0.;
1104 const double nb_it_per_solver_calls= solver_calls>0 ?
static_cast<double>(
Process::mp_max(
static_cast<double>(c_system_solver.
quantity_))) /solver_calls :
1105 static_cast<double>(Process::mp_max(static_cast<double>(c_system_solver.quantity_))) ;
1106 if (max_nb_backup>0)
1108 total_nb_backup_ += c_backup.
count_;
1109 total_data_exchange_per_backup_ += total_quantity / (max_nb_backup *1024*1024);
1112 auto write_globalTU_line = [&] (
const Counter& c_to_print_,std::ostringstream & line)
1114 if (c_to_print_.
count_>0 )
1117 int count = c_to_print_.
count_;
1118 line << std::left <<std::setw(counter_description_width) << c_to_print_.
description_ <<separator ;
1120 line << std::left << std::setw(time_per_step_width) <<t << separator << std::setprecision(3) << std::setw(percent_loop_time_width) << fmt(
"%4.1f", t_c/time_tl*100);
1123 double n =
static_cast<double>(count)/nb_ts;
1124 line << separator <<std::left << std::setw(count_per_ts_width) << std::round(n) << std::setprecision(7);
1130 auto write_globalTU_line_custom_counters = [&] (
const Counter& c_to_print_,std::ostringstream & line)
1132 if (c_to_print_.
count_>0)
1135 int count = c_to_print_.
count_;
1136 line << std::left <<std::setw(counter_description_width) <<
"Custom_counter::"+c_to_print_.
description_ <<separator ;
1137 double t = nb_ts>0 ? t_c/nb_ts : t_c;
1138 line << std::left << std::setw(time_per_step_width) <<t << separator << std::setprecision(3) << std::setw(percent_loop_time_width) << t_c/time_tl*100 ;
1141 double n =
static_cast<double>(count)/nb_ts;
1142 line << separator <<std::left << std::setw(count_per_ts_width) << std::round(n) << std::setprecision(7);
1144 line << separator <<std::setw(level_width) << c_to_print_.
level_ << std::endl;
1147 for (
int i =
static_cast<int>(STD_COUNTERS::backup_file); i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1149 Counter& c_com = *std_counters_[i];
1155 if (c_com.
family_==
"MPI_allreduce")
1159 comm_allreduce_c += c_com.
count_;
1161 if (c_com.
family_==
"MPI_sendrecv")
1165 comm_sendrecv_c += c_com.
count_;
1174 for (
const auto & pair : custom_counter_map_str_to_counter_)
1176 Counter& c_com = *pair.second;
1179 if (c_com.
family_==
"MPI_allreduce")
1183 comm_allreduce_c += c_com.
count_;
1185 if (pair.second->family_==
"MPI_sendrecv")
1189 comm_sendrecv_c += c_com.
count_;
1198 double bandwidth = 1.1e30;
1200 bandwidth =
static_cast<double>(c_mpi_sendrecv.
quantity_)/ (c_mpi_sendrecv.
total_time_.count() + MINFLOAT);
1205 double theoric_comm_time = 0.0;
1207 theoric_comm_time =
static_cast<double>(comm_allreduce_c) * allreduce_peak_perf +
static_cast<double>(comm_sendrecv_c) / max_bandwidth;
1211 double total_time_avg=0.0, total_time_max=0.0;
1214 total_time_avg = Process::mp_sum(c_timeloop.total_time_.count())/nb_procs;
1215 total_time_max = Process::mp_max(c_timeloop.total_time_.count());
1219 total_time_avg = Process::mp_sum(c_total_time.total_time_.count())/nb_procs;
1220 total_time_max = Process::mp_max(c_total_time.total_time_.count());
1222 double wait_time = (comm_sendrecv_t+ comm_allreduce_t)- theoric_comm_time;
1223 double wait_fraction;
1224 if (total_time_avg == 0)
1227 wait_fraction = wait_time / (total_time_avg + MINFLOAT);
1228 wait_fraction = 0.1 * floor(wait_fraction * 1000);
1229 if (wait_fraction < 0.)
1231 if (wait_fraction > 100.)
1232 wait_fraction = 100.;
1242 debit_seq =
static_cast<int>(std::floor(
static_cast<double>(
Process::mp_sum(
static_cast<double>(c_io_seq.
quantity_))) / (1024 * 1024) /com_time_seq));
1244 debit_par =
static_cast<int>(std::floor(
static_cast<double>(
Process::mp_sum(
static_cast<double>(c_io_par.
quantity_))) / (1024 * 1024) /com_time_par));
1247 min_max_avg_sd_t_q_c_allreduce_comm = compute_min_max_avg_sd(comm_allreduce_t,comm_allreduce_q,comm_allreduce_c);
1248 min_max_avg_sd_t_q_c_sendrecv_comm = compute_min_max_avg_sd(comm_sendrecv_t,comm_sendrecv_q,comm_sendrecv_c);
1253 if (message ==
"Computation start-up statistics")
1255 CPUInfo cpu = get_cpu();
1256 spaces.assign((max_str_length_-27)/2,
' ');
1257 file_header << spaces <<
"# Global performance file #"<< std::endl;
1258 file_header << std::endl;
1259 file_header <<
"This is the global file for tracking performance in TRUST. It stores aggregated quantities." <<std::endl;
1260 file_header <<
"More detailed statistics can be found in the "<<
Objet_U::nom_du_cas() <<
"_csv.TU file" <<std::endl;
1262 file_header <<
"For time loop, only standard counters of level 1 are printed alongside your custom counters" << std::endl;
1263 file_header <<
"Time is given in seconds"<< std::endl <<std::endl;
1264 file_header << line_sep_cpu << std::endl;
1265 spaces.assign((max_str_length_-26)/2,
' ');
1266 file_header << spaces <<
"Context of the computation"<< std::endl;
1267 file_header << line_sep_cpu << std::endl;
1268 file_header << std::left << std::setw(header_txt_width)<<
"Date:" << get_date() << std::endl;
1269 file_header << std::left << std::setw(header_txt_width)<<
"OS:" << get_os() << std::endl;
1270 file_header << std::left << std::setw(header_txt_width) <<
"CPU model : " << cpu.
model << std::endl;
1271 file_header << std::left << std::setw(header_txt_width) <<
"Total number of threads:" << cpu.
num_threads << std::endl;
1274 GPUInfo gpu = get_gpu();
1275 file_header <<
"GPU model: " << gpu.
name << std::endl;
1276#ifdef TRUST_USE_CUDA
1277 file_header <<
"CUDA runtime version: " << gpu.
runtime_version << std::endl;
1278 file_header <<
"CUDA drivers version: " << gpu.
driver_version << std::endl;
1280#ifdef TRUST_USE_ROCM
1281 file_header <<
"HIP runtime version: " << gpu.
runtime_version << std::endl;
1282 file_header <<
"HIP drivers version: " << gpu.
driver_version << std::endl;
1286 file_header <<
"GPU model: "<<
"No GPU used for the computation" << std::endl;
1287 file_header << std::left << std::setw(header_txt_width) <<
"Nb procs used for the computation: " << nb_procs << std::endl;
1288 file_header << std::left << std::setw(header_txt_width) <<
"TRUST version: " << TRUST_VERSION << std::endl ;
1289 file_header << std::left << std::setw(header_txt_width) <<
"Total number of elements used for the calculation: " << nb_elem_tot << std::endl << std::endl;
1290 file_header << line_sep_cpu << std::endl;
1291 spaces.assign((max_str_length_-message_width)/2,
' ');
1292 file_header << spaces<<message << std::endl;
1293 file_header << line_sep_cpu << std::endl;
1294 file_header << std::left << std::setw(text_width)<<
"Total time of the start-up: " << std::left <<std::setw(number_width) << c_total_time.
total_time_.count() << std::endl;
1296 else if (message ==
"Time loop statistics")
1298 file_header << line_sep_cpu << std::endl;
1299 spaces.assign((max_str_length_-message_width)/2,
' ');
1300 file_header << spaces<<message << std::endl;
1301 file_header << line_sep_cpu << std::endl;
1304 Cerr <<
"No time step after cache filling was computed" << finl;
1307 if (nb_steps_elapsed_>0)
1309 if (nb_steps_elapsed_>1)
1310 file_header <<
"The " << nb_steps_elapsed_<<
" first time steps are not accounted for the computation of the time loop statistics"<< std::endl;
1312 file_header <<
"The first time step is not accounted for the computation of the time loop statistics"<< std::endl;
1314 file_header << std::left <<std::setw(text_width)<<
"Total time of the time loop: "<< std::left <<std::setw(number_width) << time_tl << std::endl;
1315 file_header << std::left <<std::setw(text_width) <<
"Number of time steps: " << std::left <<std::setw(number_width) << nb_ts << std::endl;
1316 file_header << std::left <<std::setw(text_width) <<
"Skipped time steps: " << std::left <<std::setw(number_width) << nb_steps_elapsed_ << std::endl;
1317 file_header << std::left <<std::setw(text_width) <<
"Average time per time step: " << std::left <<std::setw(number_width) << time_tl/nb_ts << endl;
1318 file_header << std::left <<std::setw(text_width) <<
"Standard deviation between time steps: " << std::left <<std::setw(number_width) << std::sqrt(std::max(0., c_timeloop.
var_time_per_step_)) << std::endl;
1319 file_header << std::left <<std::setw(text_width) <<
"Time elapsed in the skipped time steps: " << std::left <<std::setw(number_width) << time_skipped_ts_.count() <<std::endl << std::endl;
1321 file_header << std::left <<std::setw(text_width) <<
"Percent of total time spend in communication: " << std::left <<std::setw(number_width) << 100* total_comm_time / total_time << std::endl;
1323 else if (message ==
"Post-resolution statistics")
1325 file_header << line_sep_cpu << std::endl;
1326 spaces.assign((max_str_length_-message_width)/2,
' ');
1327 file_header << spaces<<message << std::endl;
1328 file_header << line_sep_cpu << std::endl;
1329 file_header << std::left <<std::setw(text_width) <<
"Time of the post-resolution: " << std::left <<std::setw(number_width) << c_total_time.
total_time_.count() << std::endl;
1330 captions << std::endl;
1333 captions << line_sep_cpu << std::endl;
1334 captions <<
"Max waiting time big => probably due to a bad partitioning" << std::endl;
1335 captions <<
"Communications > 30% => too many processors or network too slow" << std::endl;
1336 captions << line_sep_cpu << std::endl;
1337 captions << std::endl;
1339 captions << std::left << std::setw(text_width) <<
"Total time for the whole computation" << std::left <<std::setw(number_width) << computation_time_.count()<< std::endl<< std::endl;
1342 Process::exit(
"You are trying to get stats of an unknown computation step");
1344 if(message ==
"Time loop statistics" && c_total_time.
total_time_.count()>1.0e-12 && c_timeloop.
total_time_.count()>1.0e-12)
1347 double other = time_tl/nb_ts;
1348 perfs_TU<<std::endl;
1349 perfs_TU << std::left <<std::setw(counter_description_width) <<
"Standard counter description" << separator << std::setw(time_per_step_width) <<
"Time/step" << separator << std::setw(percent_loop_time_width) <<
"% loop time" << separator << std::setw(count_per_ts_width) <<
"Call(s)/step"<<std::endl;
1350 perfs_TU << line_sep_tabular << std::endl;
1351 for (
int i =
static_cast<int>(STD_COUNTERS::system_solver); i< static_cast<int>(STD_COUNTERS::petsc_solver); i++)
1353 Counter& c_to_print = *std_counters_[i];
1355 write_globalTU_line(c_to_print,perfs_TU);
1358 if (!custom_counter_map_str_to_counter_.empty())
1360 perfs_TU << std::endl;
1361 perfs_TU << std::left <<std::setw(counter_description_width) <<
"Custom counter description" << separator << std::setw(time_per_step_width) <<
"Time/step" << separator << std::setw(percent_loop_time_width) <<
"% loop time" << separator << std::setw(count_per_ts_width) <<
"Call(s)/step"<< separator <<std::setw(level_width) <<
"Level" <<std::endl;
1362 perfs_TU << line_sep_tabular_custom<<std::endl;
1363 for (
const auto & pair : custom_counter_map_str_to_counter_)
1365 Counter& c_to_print = *pair.second;
1366 write_globalTU_line_custom_counters(c_to_print, perfs_TU);
1369 perfs_TU << std::left <<std::setw(counter_description_width) <<
"Other operations" << separator << std::setw(time_per_step_width) << other << separator << std::setprecision(3) << std::setw(percent_loop_time_width) << fmt(
"%4.1f", other/(time_tl/nb_ts)*100) << separator <<std::endl;
1371 if (max_virtual_swap_c>0)
1373 if (message==
"Time loop statistics")
1376 perfs_TU << std::left <<std::setw(text_width) <<
"Number of virtual exchanges per time step:" << std::left <<std::setw(number_width) << max_virtual_swap_c/nb_ts << std::endl;
1378 perfs_TU << std::left <<std::setw(text_width) <<
"Number of virtual exchanges" << std::left <<std::setw(number_width) << max_virtual_swap_c << std::endl;
1381 perfs_TU << std::left <<std::setw(text_width) <<
"Number of virtual exchanges:" << std::left <<std::setw(number_width) << max_virtual_swap_c << std::endl;
1383 if (min_max_avg_sd_t_q_c_allreduce_comm[2][1]>0 && nb_ts>0)
1385 double allreduce_per_ts = (double) min_max_avg_sd_t_q_c_allreduce_comm[2][1]/nb_ts;
1386 perfs_TU << std::left <<std::setw(text_width) <<
"Maximum number of MPI allreduce per time step" << std::left <<std::setw(number_width) << allreduce_per_ts << std::endl;
1387 if (allreduce_per_ts > 30.0 && message==
"Time loop statistics")
1389 perfs_TU << std::endl << line_sep_cpu << std::endl;
1390 perfs_TU <<
" Warning: number of MPI allreduce calls per time step is high. Contact TRUST team to run massive parallel calculations" << std::endl;
1391 perfs_TU << line_sep_cpu<< std::endl;
1394 int tmp = c_system_solver.
count_;
1397 perfs_TU << std::endl;
1398 avg_solv_time = avg_solv_time / tmp;
1399 if (!(message==
"Time loop statistics"))
1402 perfs_TU << std::left <<std::setw(text_width) <<
"Number of calls to the linear solver per time step: " << std::left <<std::setw(number_width) << static_cast<double>(tmp) / nb_ts << std::endl;
1404 perfs_TU << std::left <<std::setw(text_width) <<
"Number of call to the linear solver: " << std::left <<std::setw(number_width) << tmp << std::endl;
1405 perfs_TU << std::left <<std::setw(text_width) <<
"Average time of the resolution of the linear problem per call: " << std::left <<std::setw(number_width) << avg_solv_time << std::endl;
1407 perfs_TU << std::left <<std::setw(text_width) <<
"Average number of iteration of the linear solver per call: " << std::left <<std::setw(number_width) << nb_it_per_solver_calls << std::endl << std::endl;
1410 auto compute_percent_and_write_tabular_line = [& perfs_GPU, & nb_ts, & time_tl, &separator] (
const Counter& c_,
const std::string str)
1412 double max_time = c_.time_alone_.count();
1413 double calls = c_.count_/nb_ts;
1414 double t_ts = max_time/nb_ts;
1415 double bw = max_time>0 ?
static_cast<double>(c_.quantity_)/(1024.*1024.*1024*max_time) : 0.;
1416 double percent = 100*max_time/time_tl;
1417 perfs_GPU << std::left << std::setw(counter_description_width) << str <<separator << std::setw(time_per_step_width) << t_ts<<separator << std::setw(percent_loop_time_width) << fmt(
"%4.1f", percent) <<separator<< std::setw(count_per_ts_width) << calls <<separator;
1419 perfs_GPU << fmt(
"%3.1f GB/s", bw) << std::endl;
1422 perfs_GPU << std::endl;
1425 if (copy_to_device_count>0 && nb_ts >0 && message==
"Time loop statistics")
1427 perfs_GPU << std::endl << line_sep_gpu << std::endl;
1428 spaces.assign((max_str_length_-14)/2,
' ');
1429 perfs_GPU << spaces <<
"GPU statistics" << std::endl;
1430 perfs_GPU << line_sep_gpu<<std::endl;
1431 perfs_GPU << std::left <<std::setw(counter_description_width) <<
"Counter description" << separator <<std::setw(time_per_step_width) <<
"Time per step" <<separator<< std::setw(percent_loop_time_width) <<
"% loop time" <<separator<< std::setw(count_per_ts_width) <<
"Call(s)/step" <<separator<< std::setw(bandwith_width)<<
"Bandwidth"<<std::endl;
1432 perfs_GPU << line_sep_gpu << std::endl;
1433 double ratio_gpu_library = compute_percent_and_write_tabular_line(c_gpu_l,
"Libraries: ");
1434 double ratio_gpu_kernel = compute_percent_and_write_tabular_line(c_gpu_k,
"Kernels: ");
1435 double ratio_gpu = ratio_gpu_kernel+ratio_gpu_library;
1436 double ratio_copy = compute_percent_and_write_tabular_line(c_todevice,
"Copy host to device: ");
1437 ratio_copy += compute_percent_and_write_tabular_line(c_fromdevice,
"Copy device to host: ");
1438 double ratio_comm = 100.0 * (total_comm_time)/time_tl;
1439 double ratio_allocfree = compute_percent_and_write_tabular_line(c_allocfree,
"Alloc/Free on device: ");
1440 double ratio_cpu = 100 * cpu_time/time_tl;
1442 ratio_cpu = 100 - ratio_gpu - ratio_copy - ratio_allocfree - ratio_comm;
1443 perfs_GPU << std::setprecision(2) <<
"GPU: " << ratio_gpu <<
"% Copy H<->D: " << ratio_copy <<
"% Alloc/free: " << ratio_allocfree <<
"% Comm: "<< ratio_comm <<
"% CPU & I/O: " << ratio_cpu <<
"%"<<std::endl;
1446 Cerr <<
"==============================================================================================" << finl;
1447 Cerr <<
"[GPU] Warning: Only " << 0.1*int(10*ratio_gpu) <<
" % of the time calculation is spent on GPU." << finl;
1448 if (ratio_gpu_library==0)
1449 Cerr <<
"[GPU] First add a GPU solver !" << finl;
1451 Cerr <<
"[GPU] Probably some algorithms used are not ported yet on GPU. Contact TRUST team." << finl;
1452 Cerr <<
"==============================================================================================" << finl;
1455 if (message==
"Time loop statistics")
1457 if (debit_seq>0 || debit_par>0)
1459 perfs_IO << std::endl << line_sep_cpu << std::endl;
1460 spaces.assign((max_str_length_-message_width-4)/2,
' ');
1461 perfs_IO << spaces << message <<
": IO" << std::endl;
1462 perfs_IO << line_sep_cpu<<std::endl;
1465 perfs_IO << std::left <<std::setw(text_width) <<
"Output write sequential: " << std::left <<std::setw(number_width) << debit_seq <<
"MB/s"<< std::endl;
1467 perfs_IO << std::left <<std::setw(text_width) <<
"Output write parallel: " << std::left <<std::setw(number_width) << debit_par <<
"MB/s" << std::endl;
1468 if (total_nb_backup_>0)
1470 perfs_IO << std::left <<std::setw(text_width) <<
"Total number of back-up: " << std::left <<std::setw(number_width) << total_nb_backup_ << std::endl;
1471 perfs_IO << std::left <<std::setw(text_width) <<
"Total amount of data per back-up: " << std::left <<std::setw(number_width) << total_data_exchange_per_backup_ <<
"MB"<< std::endl;
1473 if(min_max_avg_sd_t_q_c_sendrecv_comm[2][1] > 0)
1477 perfs_IO<< std::endl<<
"---------------------------------------------------------------------------------------------------------"<< std::endl;
1478 perfs_IO<<
"Warning: One or several PETSc solvers are used and thus the communication time below are under-estimated."<< std::endl;
1479 perfs_IO<<
"---------------------------------------------------------------------------------------------------------"<< std::endl<< std::endl;
1481 double fraction = 0.0;
1482 fraction = (comm_sendrecv_t + comm_allreduce_t)/ (total_time + MINFLOAT);
1483 fraction = 0.1 * floor(fraction * 1000);
1484 if (fraction > 100.)
1486 perfs_IO << std::left <<std::setw(text_width+10) <<
"Average of the fraction of the time spent in communications between processors: " << std::left <<std::setw(number_width) << fraction <<
"%" << std::endl;
1487 fraction = (min_max_avg_sd_t_q_c_sendrecv_comm[0][1] + min_max_avg_sd_t_q_c_allreduce_comm[0][1])/ (total_time_max + MINFLOAT);
1488 fraction = 0.1 * floor(fraction * 1000);
1489 if (fraction > 100.)
1491 perfs_IO << std::left <<std::setw(text_width+10) <<
"Max of the fraction of the time spent in communications between processors: " << std::left <<std::setw(number_width) << fraction <<
"%" << std::endl;
1492 fraction = (min_max_avg_sd_t_q_c_sendrecv_comm[0][0] + min_max_avg_sd_t_q_c_allreduce_comm[0][0])/ (total_time_max + MINFLOAT);
1493 fraction = 0.1 * floor(fraction * 1000);
1494 perfs_IO << std::left <<std::setw(text_width+10) <<
"Min of the fraction of the time spent in communications between processors: " << std::left <<std::setw(number_width) << fraction <<
"%" << std::endl;
1495 perfs_IO << std::left <<std::setw(text_width+10) <<
"Time of one mpsum measured by an internal bench over 0.1s (network latency): ";
1496 if (allreduce_peak_perf == 0.)
1497 perfs_IO <<
"not measured (total running time too short <10s)" << std::endl;
1499 perfs_IO << std::left <<std::setw(number_width) << allreduce_peak_perf << std::endl;
1500 perfs_IO << std::left <<std::setw(text_width+10) <<
"Network maximum bandwidth on all processors: " << std::left <<std::setw(number_width) << fmt(
"%4.1f GB/s",max_bandwidth * 1.e-9) << std::endl ;
1502 perfs_IO << std::left <<std::setw(text_width+10) <<
"Total network traffic: " << std::left <<std::setw(number_width) << static_cast<double>(comm_sendrecv_q) *
Process::nproc() / nb_ts * 1e-6 <<
"MB/time step" << std::endl;
1504 perfs_IO << std::left <<std::setw(text_width+10) <<
"Total network traffic: " << std::left <<std::setw(number_width) << static_cast<double>(comm_sendrecv_q) *
Process::nproc()* 1e-6 <<
"MB" << std::endl;
1505 perfs_IO << std::left <<std::setw(text_width+10) <<
"Average message size: " << std::left <<std::setw(number_width) << static_cast<double>(comm_sendrecv_q) /
static_cast<double>(comm_sendrecv_c)* 1e-3 <<
"kB" << std::endl;
1506 perfs_IO << std::left <<std::setw(text_width+10) <<
"Min waiting time: " << std::left <<std::setw(number_width) << min_wait_fraction <<
"% of total time"<< std::endl;;
1507 perfs_IO << std::left <<std::setw(text_width+10) <<
"Max waiting time: " << std::left <<std::setw(number_width) << max_wait_fraction<<
"% of total time"<< std::endl;;
1508 perfs_IO << std::left <<std::setw(text_width+10) <<
"Avg waiting time: " << std::left <<std::setw(number_width) << avg_wait_fraction<<
"% of total time"<< std::endl;;
1514 auto printing_mode = ios::app;
1515 if (message==
"Computation start-up statistics")
1516 printing_mode= ios::out;
1517 SFichier file(globalTU, printing_mode);
1518 file << file_header.str();
1519 file << perfs_TU.str();
1520 file << perfs_IO.str();
1521 file << perfs_GPU.str();
1522 file << captions.str();
1523 clean_stringstream(file_header);
1524 clean_stringstream(perfs_TU);
1525 clean_stringstream(perfs_GPU);
1526 clean_stringstream(perfs_IO);
1527 clean_stringstream(captions);
1535 if (counter_level <=0)
1536 Process::exit(
"Custom counters should not be set with a zero or negative level value");
1537 if (custom_counter_map_str_to_counter_.count(counter_description)==0)
1539 auto result =custom_counter_map_str_to_counter_.emplace(counter_description, std::make_unique<Counter>(counter_level, counter_description, counter_family ,is_comm, is_gpu));
1541 Process::exit(
"Failed to insert the new custom counter in the custom counter map");
1547 if (!counters_stop_)
1549 Counter& c = get_counter(std_cnt);
1550 if (counter_lvl == -100000)
1553 check_begin(c, counter_lvl,t);
1565 if (!counters_stop_)
1567 Counter& c = get_counter(custom_count_name);
1568 if (counter_lvl == -100000)
1571 check_begin(c, counter_lvl,t);
1584 if (!counters_stop_)
1586 Counter& c = get_counter(std_cnt);
1589 c.
end_count_(count_increment, quantity_increment,t);
1604 if (!counters_stop_)
1606 Counter& c = get_counter(custom_count_name);
1608 assert(custom_counter_map_str_to_counter_.count(custom_count_name) > 0);
1610 c.
end_count_(count_increment, quantity_increment,t);
1623 Process::exit(
"The counter are already stop, you can't stop them two times in a row \n");
1624 if (last_opened_counter_ !=
nullptr)
1626 Counter* c = last_opened_counter_;
1629 while (c!=
nullptr && c->
level_ >= 0)
1633 c->
time_ts_ += time_elapsed_before_stop;
1638 counters_stop_=
true;
1647 if (counters_stop_==
false)
1648 Process::exit(
"Try to restart counters but they have never been stopped before");
1649 if (last_opened_counter_ !=
nullptr)
1651 Counter* c = last_opened_counter_;
1653 while (c !=
nullptr && c->
level_ >= 0)
1661 counters_stop_=
false;
1667 for (
int i = 1 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1669 Counter& c = *std_counters_[i];
1672 for (
const auto &
pair : custom_counter_map_str_to_counter_)
1681 nb_steps_elapsed_=time_step_elapsed;
1682 if (time_step_elapsed>0)
1689 Process::exit(
"The counters are stop, you can't access the total time");
1690 Counter& c = get_counter(STD_COUNTERS::total_execution_time);
1693 return (computation_time_.count());
1698 Counter& c = get_counter(name);
1707 Counter& c = get_counter(name);
1716 Counter& c = get_counter(name);
1726 Counter& c = get_counter(name);
1736 if (last_opened_counter_==
nullptr)
1737 Process::exit(
"You are trying to start the time loop before the start-up");
1744 Process::exit(
"The time loop has not started, but you are trying to end it");
1750 assert (last_opened_counter_!=
nullptr);
1752 if (last_opened_counter_ ==
nullptr)
1753 Process::exit(
"You are trying to start a time step outside the time loop");
1755 for (
int i =0 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1757 Counter& c_std = *std_counters_[i];
1760 for (
const auto &
pair : custom_counter_map_str_to_counter_)
1765 Counter* c = last_opened_counter_;
1783 if (last_opened_counter_ ==
nullptr)
1784 Process::exit(
"You are trying to compute the statistics of a time steps but have not open any counter");
1786 Process::exit(
"You are trying to compute time loop statistics outside of the time loop");
1787 double step =
static_cast<double>(tstep) -
static_cast<double>(nb_steps_elapsed_);
1788 auto compute = [step](
Counter& c)
1804 for (
int i =0 ; i< static_cast<int>(STD_COUNTERS::NB_OF_STD_COUNTER); i++)
1806 Counter& c_std = *std_counters_[i];
1809 if (!custom_counter_map_str_to_counter_.empty())
1811 for (
const auto &
pair : custom_counter_map_str_to_counter_)
1820 end_cache_ = tstep >= nb_steps_elapsed_;
1823 time_skipped_ts_ = get_counter(STD_COUNTERS::timeloop).total_time_;
1824 computation_time_ += get_counter(STD_COUNTERS::total_execution_time).total_time_;
1833 if (last_opened_counter_==
nullptr)
1835 return last_opened_counter_->level_;
1846 Counter& c_time = get_counter(STD_COUNTERS::total_execution_time);
1851 print_global_TU(message);
1852 print_performance_to_csv(message);
1856 counters_stop_=
false;
1862 Process::exit(
"You try to start the gpu timer and it is already running");
1863 gpu_timer_start_=
now();
1870 Process::exit(
"You try to stop the GPU timer, but it has not been started yet");
1883 return gpu_verbose_;
1891 return init_device_;
1903 gpu_timer_count_ += to_add;
1907 return gpu_timer_count_;
1918Perf_counters::~Perf_counters()=
default;
1929 pimpl_->create_custom_counter_impl(counter_description , counter_level, counter_family , is_comm, is_gpu);
1934 pimpl_->begin_count_impl(std_cnt,counter_lvl);
1939 pimpl_->begin_count_impl(custom_count_name,counter_lvl);
1944 pimpl_->end_count_impl(custom_count_name,count_increment,quantity_increment);
1949 pimpl_->end_count_impl(std_cnt,count_increment,quantity_increment);
1954 pimpl_->stop_counters_impl();
1959 pimpl_->restart_counters_impl();
1964 pimpl_->reset_counters_impl();
1969 pimpl_->print_TU_files_impl(message);
1974 return pimpl_->get_computation_time_impl();
1979 return pimpl_->get_total_time_impl(name);
1984 return pimpl_->get_total_time_impl(name);
1989 return pimpl_->get_time_since_last_open_impl(name);
1994 return pimpl_->get_time_since_last_open_impl(name);
1999 return pimpl_->running_impl(name);
2004 pimpl_->start_timeloop_impl();
2009 pimpl_->end_timeloop_impl();
2014 pimpl_->start_time_step_impl();
2019 pimpl_->end_time_step_impl(tstep);
2024 pimpl_->set_time_steps_elapsed_impl(n);
2029 return pimpl_->get_last_opened_counter_level_impl();
2034 pimpl_->record_nb_elem_impl(nb_elem);
2039 pimpl_->start_gpu_timer_impl();
2044 pimpl_->stop_gpu_timer_impl();
2049 return pimpl_->is_gpu_verbose_on_impl();
2054 pimpl_->set_gpu_verbose_impl(on);
2059 return pimpl_->get_init_device_impl();
2064 pimpl_->set_init_device_impl(init);
2069 return pimpl_->get_gpu_timer_impl();
2074 pimpl_->add_to_gpu_timer_counter_impl(to_add);
2079 return pimpl_->get_gpu_timer_counter_impl();
2084 return pimpl_->compute_gpu_time_impl();
static bool disable_TU
Flag to disable or not the writing of the .TU files.
static bool stat_per_proc_perf_log
Flag to enable the writing of the statistics detailed per processor in _csv.TU file.
static const Nom & nom_du_cas()
Renvoie une reference constante vers le nom du cas.
bool get_init_device_impl() const
void print_TU_files_impl(const std::string &message)
void restart_counters_impl()
Restart all counters, has to be called on every processor simultaneously.
bool is_gpu_verbose_on_impl() const
int get_last_opened_counter_level_impl() const
bool get_gpu_timer_impl() const
int get_gpu_timer_counter_impl() const
double compute_gpu_time_impl()
void stop_counters_impl()
Stop all counters, has to be called on every processor simultaneously.
void start_time_step_impl()
void start_gpu_timer_impl()
void begin_count_impl(const STD_COUNTERS &std_cnt, int counter_lvl)
void set_gpu_fence_impl(bool fence)
void create_custom_counter_impl(std::string counter_description, int counter_level, std::string counter_family, bool is_comm, bool is_gpu)
void end_count_impl(const STD_COUNTERS &std_cnt, int count_increment, long int quantity_increment)
End the count of a counter and update the counter values.
bool get_gpu_fence_impl() const
double get_total_time_impl(const STD_COUNTERS &name)
void set_time_steps_elapsed_impl(int time_step_elapsed)
double get_time_since_last_open_impl(const STD_COUNTERS &name)
bool running_impl(const STD_COUNTERS name)
void record_nb_elem_impl(trustIdType nb_elem)
bool get_use_gpu_impl() const
void add_to_gpu_timer_counter_impl(int to_add=1)
void set_gpu_verbose_impl(bool on)
void end_time_step_impl(long int tstep)
Compute for each counter open during a time step avg_time_per_step_, min_time_per_step_,...
void reset_counters_impl()
void set_init_device_impl(bool init)
std::chrono::time_point< clock > time_point
double get_computation_time_impl()
void start_timeloop_impl()
std::chrono::duration< double > duration
void stop_gpu_timer_impl()
std::chrono::high_resolution_clock clock
int get_last_opened_counter_level() const
bool is_running(const STD_COUNTERS &name)
Check whether a counter is already running. Should rarely be used!
std::chrono::time_point< clock > time_point
int get_gpu_timer_counter() const
void end_time_step(long int tstep)
This function compute statistics per time steps of counters used at least once during a time step.
double stop_gpu_timer_and_compute_gpu_time()
void start_timeloop()
Set time_loop_ to true in order to account for cache properly.
Perf_counters(const Perf_counters &)=delete
void set_gpu_verbose(bool on)
void set_gpu_fence(bool fence)
bool get_gpu_timer() const
std::chrono::duration< double > duration
void record_nb_elem(trustIdType nb_elem)
void begin_count(const STD_COUNTERS &std_cnt, int counter_lvl=-100000)
void set_init_device(bool init)
bool get_gpu_fence() const
void set_nb_time_steps_elapsed(int n)
double get_time_since_last_open(const STD_COUNTERS &name)
Give as a double the time (in second) elapsed in the operation tracked by the standard counter call n...
void add_to_gpu_timer_counter(int to_add)
void stop_counters()
Stop all counters, has to be called on every processor simultaneously.
double get_total_time(const STD_COUNTERS &name)
Give as a double the total time (in second) elapsed in the operation tracked by the standard counter ...
bool get_init_device() const
void restart_counters()
Restart all counters, has to be called on every processor simultaneously.
void end_timeloop()
Set time_loop_ to false as we exit the time loop.
void reset_counters()
Reset counters to zero, used between the start-up of the computation, the computation itself and the ...
double compute_time(time_point start)
return time since start in seconds
void print_TU_files(const std::string &message)
Function that encapsulate the two functions that writes the TU files.
void end_count(const std::string &custom_count_name, int count_increment=1, long int quantity_increment=0)
End the count of a counter and update the counter values.
void create_custom_counter(std::string counter_description, int counter_level, std::string counter_family="None", bool is_comm=false, bool is_gpu=false)
Create a new counter and add it to the map of custom counters.
void start_time_step()
, this function start statistics tracking for a time step. It has to be called at the start of each t...
bool is_gpu_verbose_on() const
double get_computation_time()
Update computation_time_ and return its value as a double (in seconds).
static double mp_min(double)
static double mp_max(double)
static bool is_parallel()
static int nproc()
renvoie le nombre de processeurs dans le groupe courant Voir Comm_Group::nproc() et PE_Groups::curren...
static double mp_sum(double)
Calcule la somme de x sur tous les processeurs du groupe courant.
static void barrier()
Synchronise tous les processeurs du groupe courant (attend que tous les processeurs soient arrives a ...
static int me()
renvoie mon rang dans le groupe de communication courant.
static void exit(int exit_code=-1)
Routine de sortie de TRUST dans une region Kokkos.
static int je_suis_maitre()
renvoie 1 si on est sur le processeur maitre du groupe courant (c'est a dire me() == 0),...
std::chrono::duration< double > duration
double max_time_per_step_
double avg_time_per_step_
double var_time_per_step_
time_point last_open_time_alone_
void end_count_(int count_increment, long int quantity_increment, time_point t_stop)
double min_time_per_step_
time_point last_open_time_
std::array< std::array< double, 4 >, 4 > compute_min_max_avg_sd_() const
update variables : avg_time_per_step_ , min_time_per_step_ , max_time_per_step_ , sd_time_per_step_
Counter(int counter_level, std::string counter_name, std::string counter_family="None", bool is_comm=false, bool is_gpu=false)
const std::string description_
std::chrono::high_resolution_clock clock
const std::string family_
void begin_count_(int counter_level, time_point t)
void set_parent(Counter *parent_counter)
std::chrono::time_point< clock > time_point
std::string runtime_version
std::string driver_version