The simple solver with performance debugging example..
This example depends on simple-solver-logging, minimal-cuda-solver.
Introduction
About the example
This example runs a solver on a test problem and shows how to use loggers to debug performance and convergence rate.
The commented program
creates a zero vector
template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
ValueType value)
{
return res;
}
static std::unique_ptr< Dense > create(std::shared_ptr< const Executor > exec, const dim< 2 > &size={}, size_type stride=0)
std::size_t size_type
Definition types.hpp:89
Definition matrix_data.hpp:126
utilities for computing norms and residuals
template <typename ValueType>
{
}
template <typename ValueType>
{
auto b_norm = gko::initialize<real_vec<ValueType>>({0.0}, exec);
return get_first_element(b_norm.get());
}
template <typename ValueType>
{
auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
system_matrix->
apply(one, x, neg_one, res);
return compute_norm(res.get());
}
}
namespace loggers {
Definition lin_op.hpp:117
LinOp * apply(ptr_param< const LinOp > b, ptr_param< LinOp > x)
Definition lin_op.hpp:129
std::shared_ptr< const Executor > get_executor() const noexcept
Definition polymorphic_object.hpp:243
const value_type * get_const_values() const noexcept
Definition dense.hpp:859
void compute_norm2(ptr_param< LinOp > result) const
constexpr T one()
Definition math.hpp:630
detail::cloned_type< Pointer > clone(const Pointer &p)
Definition utils_helper.hpp:173
typename detail::remove_complex_s< T >::type remove_complex
Definition math.hpp:260
A logger that accumulates the time of all operations. For each operation type (allocations, free, copy, internal operations i.e. kernels), the timing is taken before and after. This can create significant overhead since to ensure proper timings, calls to synchronize are required.
{
this->start_operation(exec, "allocate");
}
{
this->end_operation(exec, "allocate");
}
{
this->start_operation(exec, "free");
}
{
this->end_operation(exec, "free");
}
{
this->start_operation(to, "copy");
}
{
this->end_operation(to, "copy");
}
{
this->start_operation(exec, op->
get_name());
}
{
this->end_operation(exec, op->
get_name());
}
void write_data(std::ostream& ostream)
{
for (const auto& entry : total) {
ostream << "\t" << entry.first.c_str() << ": "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(
entry.second)
.count()
<< std::endl;
}
}
private:
Definition executor.hpp:615
virtual void synchronize() const =0
Definition executor.hpp:258
virtual const char * get_name() const noexcept
virtual void on_copy_started(const Executor *exec_from, const Executor *exec_to, const uintptr &loc_from, const uintptr &loc_to, const size_type &num_bytes) const
Definition logger.hpp:177
virtual void on_free_started(const Executor *exec, const uintptr &location) const
Definition logger.hpp:155
virtual void on_allocation_started(const Executor *exec, const size_type &num_bytes) const
Definition logger.hpp:135
virtual void on_copy_completed(const Executor *exec_from, const Executor *exec_to, const uintptr &loc_from, const uintptr &loc_to, const size_type &num_bytes) const
Definition logger.hpp:190
virtual void on_allocation_completed(const Executor *exec, const size_type &num_bytes, const uintptr &location) const
Definition logger.hpp:146
virtual void on_operation_completed(const Executor *exec, const Operation *op) const
Definition logger.hpp:213
virtual void on_free_completed(const Executor *exec, const uintptr &location) const
Definition logger.hpp:164
virtual void on_operation_launched(const Executor *exec, const Operation *op) const
Definition logger.hpp:199
std::uintptr_t uintptr
Definition types.hpp:141
Helper which synchronizes and starts the time before every operation.
const std::string& name) const
{
nested.emplace_back(0);
start[name] = std::chrono::steady_clock::now();
}
Helper to compute the end time and store the operation's time at its end. Also time nested operations.
void end_operation(
const gko::Executor* exec,
const std::string& name)
const
{
const auto end = std::chrono::steady_clock::now();
const auto diff = end - start[name];
make sure timings for nested operations are not counted twice
total[name] += diff - nested.back();
nested.pop_back();
if (nested.size() > 0) {
nested.back() += diff;
}
}
mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
mutable std::map<std::string, std::chrono::steady_clock::duration> total;
the position i of this vector holds the total time spend on child operations on nesting level i
mutable std::vector<std::chrono::steady_clock::duration> nested;
};
This logger tracks the persistently allocated data
Store amount of bytes allocated on every allocation
{
storage[location] = num_bytes;
}
Reset the amount of bytes on every free
{
storage[location] = 0;
}
Write the data after summing the total from all allocations
void write_data(std::ostream& ostream)
{
for (const auto& e : storage) {
total += e.second;
}
ostream << "Storage: " << total << std::endl;
}
private:
mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};
Logs true and recurrent residuals of the solver
template <typename ValueType>
Definition custom-logger.cpp:34
Depending on the available information, store the norm or compute it from the residual. If the true residual norm could not be computed, store the value -1.0.
{
if (residual_norm) {
rec_res_norms.push_back(utils::get_first_element(
} else {
rec_res_norms.push_back(
}
if (solution) {
true_res_norms.push_back(utils::compute_residual_norm(
} else {
true_res_norms.push_back(-1.0);
}
}
:
gko::log::Logger(
gko::log::Logger::iteration_complete_mask),
matrix{matrix},
b{b}
{}
void write_data(std::ostream& ostream)
{
ostream << "Recurrent Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : rec_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
ostream << "True Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : true_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
}
private:
mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};
}
namespace {
The Ginkgo namespace.
Definition abstract_factory.hpp:20
std::decay_t< T > * as(U *obj)
Definition utils_helper.hpp:307
Print usage help
void print_usage(const char* filename)
{
std::cerr << "Usage: " << filename << " [executor] [matrix file]"
<< std::endl;
std::cerr << "matrix file should be a file in matrix market format. "
"The file data/A.mtx is provided as an example."
<< std::endl;
std::exit(-1);
}
template <typename ValueType>
{
std::cout << "[" << std::endl;
for (int i = 0; i < elements_to_print; ++i) {
std::cout <<
"\t" <<
vec->
at(i) << std::endl;
}
std::cout << "];" << std::endl;
}
}
int main(int argc, char* argv[])
{
value_type & at(size_type row, size_type col) noexcept
Definition dense.hpp:891
Parametrize the benchmark here Pick a value type
using ValueType = double;
using IndexType = int;
Pick a matrix format
Pick a solver
Pick a preconditioner type
Definition identity.hpp:88
Pick a residual norm reduction value
Pick an output file name
const auto of_name = "log.txt";
Simple shortcut
Print version information
static const version_info & get()
Definition version.hpp:139
Figure out where to run the code
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
Figure out where to run the code
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
static std::shared_ptr< CudaExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_cuda_alloc_mode, CUstream_st *stream=nullptr)
static std::shared_ptr< DpcppExecutor > create(int device_id, std::shared_ptr< Executor > master, std::string device_type="all", dpcpp_queue_property property=dpcpp_queue_property::in_order)
static std::shared_ptr< HipExecutor > create(int device_id, std::shared_ptr< Executor > master, bool device_reset, allocation_mode alloc_mode=default_hip_alloc_mode, CUstream_st *stream=nullptr)
static std::shared_ptr< OmpExecutor > create(std::shared_ptr< CpuAllocatorBase > alloc=std::make_shared< CpuAllocator >())
Definition executor.hpp:1396
executor where Ginkgo will perform the computation
const auto exec = exec_map.at(executor_string)();
Read the input matrix file directory
std::string input_mtx = "data/A.mtx";
if (argc == 3) {
input_mtx = std::string(argv[2]);
}
Read data: A is read from disk Create a StorageLogger to track the size of A
auto storage_logger = std::make_shared<loggers::StorageLogger>();
Add the logger to the executor
void add_logger(std::shared_ptr< const log::Logger > logger) override
Definition executor.hpp:838
Read the matrix A from file
auto A =
gko::share(gko::read<mtx>(std::ifstream(input_mtx), exec));
detail::shared_type< OwningPointer > share(OwningPointer &&p)
Definition utils_helper.hpp:224
Remove the storage logger
void remove_logger(const log::Logger *logger) override
Definition executor.hpp:851
Pick a maximum iteration count
const auto max_iters = A->get_size()[0];
Generate b and x vectors
auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
Declare the solver factory. The preconditioner's arguments should be adapted if needed.
auto solver_factory =
solver::build()
.with_criteria(
.with_reduction_factor(reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_iters))
.with_preconditioner(preconditioner::create(exec))
.on(exec);
Definition residual_norm.hpp:113
Declare the output file for all our loggers
std::ofstream output_file(of_name);
Do a warmup run
Clone x to not overwrite the original one
Generate and call apply on a solver
solver_factory->generate(A)->apply(b, x_clone);
}
Do a timed run
Clone x to not overwrite the original one
Synchronize ensures no operation are ongoing
Time before generate
auto g_tic = std::chrono::steady_clock::now();
Generate a solver
auto generated_solver = solver_factory->generate(A);
Time after generate
auto g_tac = std::chrono::steady_clock::now();
Compute the generation time
auto generate_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);
Write the generate time to the output file
output_file << "Generate time (ns): " << generate_time.count()
<< std::endl;
Similarly time the apply
auto a_tic = std::chrono::steady_clock::now();
generated_solver->apply(b, x_clone);
auto a_tac = std::chrono::steady_clock::now();
auto apply_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
output_file << "Apply time (ns): " << apply_time.count() << std::endl;
Compute the residual norm
auto residual =
utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
output_file << "Residual_norm: " << residual << std::endl;
}
Log the internal operations using the OperationLogger without timing
Create an OperationLogger to analyze the generate step
auto gen_logger = std::make_shared<loggers::OperationLogger>();
Add the generate logger to the executor
Generate a solver
auto generated_solver = solver_factory->generate(A);
Remove the generate logger from the executor
Write the data to the output file
output_file << "Generate operations times (ns):" << std::endl;
gen_logger->write_data(output_file);
Create an OperationLogger to analyze the apply step
auto apply_logger = std::make_shared<loggers::OperationLogger>();
Create a ResidualLogger to log the recurent residual
auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
A.get(), b.get());
generated_solver->add_logger(res_logger);
Solve the system
generated_solver->apply(b, x);
Write the data to the output file
output_file << "Apply operations times (ns):" << std::endl;
apply_logger->write_data(output_file);
res_logger->write_data(output_file);
}
Print solution
std::cout << "Solution, first ten entries: \n";
print_vector(x.get());
Print output file location
std::cout << "The performance and residual data can be found in " << of_name
<< std::endl;
}
Results
This is the expected standard output:
Solution, first ten entries:
[
0.252218
0.108645
0.0662811
0.0630433
0.0384088
0.0396536
0.0402648
0.0338935
0.0193098
0.0234653
];
The performance and residual data can be found in log.txt
Here is a sample output in the file log.txt:
Generate time (ns): 861
Apply time (ns): 108144
Residual_norm: 2.10788e-15
Generate operations times (ns):
Apply operations times (ns):
allocate: 14991
cg::initialize#8: 872
cg::step_1#5: 7683
cg::step_2#7: 7756
copy: 7751
csr::advanced_spmv#5: 21819
csr::spmv#3: 20429
dense::compute_dot#3: 18043
dense::compute_norm2#2: 16726
free: 8857
residual_norm::residual_norm#9: 3614
Recurrent Residual Norms:
[
4.3589
2.30455
1.46771
0.984875
0.741833
0.513623
0.384165
0.316439
0.227709
0.170312
0.0973722
0.0616831
0.0454123
0.031953
0.0161606
0.00657015
0.00264367
0.000858809
0.000286461
1.64195e-15
];
True Residual Norms:
[
4.3589
2.30455
1.46771
0.984875
0.741833
0.513623
0.384165
0.316439
0.227709
0.170312
0.0973722
0.0616831
0.0454123
0.031953
0.0161606
0.00657015
0.00264367
0.000858809
0.000286461
2.10788e-15
];
Comments about programming and debugging
The plain program
#include <algorithm>
#include <array>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <map>
#include <ostream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <ginkgo/ginkgo.hpp>
template <typename ValueType>
template <typename ValueType>
namespace utils {
template <typename ValueType>
std::unique_ptr<vec<ValueType>> create_vector(
ValueType value)
{
return res;
}
template <typename ValueType>
{
}
template <typename ValueType>
{
auto b_norm = gko::initialize<real_vec<ValueType>>({0.0}, exec);
return get_first_element(b_norm.get());
}
template <typename ValueType>
{
auto one = gko::initialize<vec<ValueType>>({1.0}, exec);
auto neg_one = gko::initialize<vec<ValueType>>({-1.0}, exec);
system_matrix->
apply(one, x, neg_one, res);
return compute_norm(res.get());
}
}
namespace loggers {
{
this->start_operation(exec, "allocate");
}
{
this->end_operation(exec, "allocate");
}
{
this->start_operation(exec, "free");
}
{
this->end_operation(exec, "free");
}
{
this->start_operation(to, "copy");
}
{
this->end_operation(to, "copy");
}
{
this->start_operation(exec, op->
get_name());
}
{
this->end_operation(exec, op->
get_name());
}
void write_data(std::ostream& ostream)
{
for (const auto& entry : total) {
ostream << "\t" << entry.first.c_str() << ": "
<< std::chrono::duration_cast<std::chrono::nanoseconds>(
entry.second)
.count()
<< std::endl;
}
}
private:
const std::string& name) const
{
nested.emplace_back(0);
start[name] = std::chrono::steady_clock::now();
}
void end_operation(
const gko::Executor* exec,
const std::string& name)
const
{
const auto end = std::chrono::steady_clock::now();
const auto diff = end - start[name];
total[name] += diff - nested.back();
nested.pop_back();
if (nested.size() > 0) {
nested.back() += diff;
}
}
mutable std::map<std::string, std::chrono::steady_clock::time_point> start;
mutable std::map<std::string, std::chrono::steady_clock::duration> total;
mutable std::vector<std::chrono::steady_clock::duration> nested;
};
{
storage[location] = num_bytes;
}
{
storage[location] = 0;
}
void write_data(std::ostream& ostream)
{
for (const auto& e : storage) {
total += e.second;
}
ostream << "Storage: " << total << std::endl;
}
private:
mutable std::unordered_map<gko::uintptr, gko::size_type> storage;
};
template <typename ValueType>
{
if (residual_norm) {
rec_res_norms.push_back(utils::get_first_element(
} else {
rec_res_norms.push_back(
}
if (solution) {
true_res_norms.push_back(utils::compute_residual_norm(
} else {
true_res_norms.push_back(-1.0);
}
}
matrix{matrix},
b{b}
{}
void write_data(std::ostream& ostream)
{
ostream << "Recurrent Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : rec_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
ostream << "True Residual Norms: " << std::endl;
ostream << "[" << std::endl;
for (const auto& entry : true_res_norms) {
ostream << "\t" << entry << std::endl;
}
ostream << "];" << std::endl;
}
private:
mutable std::vector<gko::remove_complex<ValueType>> rec_res_norms;
mutable std::vector<gko::remove_complex<ValueType>> true_res_norms;
};
}
namespace {
void print_usage(const char* filename)
{
std::cerr << "Usage: " << filename << " [executor] [matrix file]"
<< std::endl;
std::cerr << "matrix file should be a file in matrix market format. "
"The file data/A.mtx is provided as an example."
<< std::endl;
std::exit(-1);
}
template <typename ValueType>
{
std::cout << "[" << std::endl;
for (int i = 0; i < elements_to_print; ++i) {
std::cout <<
"\t" <<
vec->
at(i) << std::endl;
}
std::cout << "];" << std::endl;
}
}
int main(int argc, char* argv[])
{
using ValueType = double;
using IndexType = int;
const auto of_name = "log.txt";
if (argc == 2 && (std::string(argv[1]) == "--help")) {
std::cerr << "Usage: " << argv[0] << " [executor]" << std::endl;
std::exit(-1);
}
const auto executor_string = argc >= 2 ? argv[1] : "reference";
std::map<std::string, std::function<std::shared_ptr<gko::Executor>()>>
exec_map{
{"cuda",
[] {
}},
{"hip",
[] {
}},
{"dpcpp",
[] {
}},
{"reference", [] { return gko::ReferenceExecutor::create(); }}};
const auto exec = exec_map.at(executor_string)();
std::string input_mtx = "data/A.mtx";
if (argc == 3) {
input_mtx = std::string(argv[2]);
}
auto storage_logger = std::make_shared<loggers::StorageLogger>();
auto A =
gko::share(gko::read<mtx>(std::ifstream(input_mtx), exec));
const auto max_iters = A->get_size()[0];
auto b = utils::create_vector<ValueType>(exec, A->get_size()[0], 1.0);
auto x = utils::create_vector<ValueType>(exec, A->get_size()[0], 0.0);
auto solver_factory =
solver::build()
.with_criteria(
.with_reduction_factor(reduction_factor),
gko::stop::Iteration::build().with_max_iters(max_iters))
.with_preconditioner(preconditioner::create(exec))
.on(exec);
std::ofstream output_file(of_name);
{
solver_factory->generate(A)->apply(b, x_clone);
}
{
auto g_tic = std::chrono::steady_clock::now();
auto generated_solver = solver_factory->generate(A);
auto g_tac = std::chrono::steady_clock::now();
auto generate_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(g_tac - g_tic);
output_file << "Generate time (ns): " << generate_time.count()
<< std::endl;
auto a_tic = std::chrono::steady_clock::now();
generated_solver->apply(b, x_clone);
auto a_tac = std::chrono::steady_clock::now();
auto apply_time =
std::chrono::duration_cast<std::chrono::nanoseconds>(a_tac - a_tic);
output_file << "Apply time (ns): " << apply_time.count() << std::endl;
auto residual =
utils::compute_residual_norm(A.get(), b.get(), x_clone.get());
output_file << "Residual_norm: " << residual << std::endl;
}
{
auto gen_logger = std::make_shared<loggers::OperationLogger>();
auto generated_solver = solver_factory->generate(A);
output_file << "Generate operations times (ns):" << std::endl;
gen_logger->write_data(output_file);
auto apply_logger = std::make_shared<loggers::OperationLogger>();
auto res_logger = std::make_shared<loggers::ResidualLogger<ValueType>>(
A.get(), b.get());
generated_solver->add_logger(res_logger);
generated_solver->apply(b, x);
output_file << "Apply operations times (ns):" << std::endl;
apply_logger->write_data(output_file);
res_logger->write_data(output_file);
}
std::cout << "Solution, first ten entries: \n";
print_vector(x.get());
std::cout << "The performance and residual data can be found in " << of_name
<< std::endl;
}
Logger(std::shared_ptr< const gko::Executor > exec, const mask_type &enabled_events=all_events_mask)
Definition logger.hpp:674
void on_free_completed(const gko::Executor *exec, const gko::uintptr &) const override
Definition performance-debugging.cpp:99
void on_allocation_completed(const gko::Executor *exec, const gko::size_type &, const gko::uintptr &) const override
Definition performance-debugging.cpp:86
void on_operation_launched(const gko::Executor *exec, const gko::Operation *op) const override
Definition performance-debugging.cpp:121
void on_copy_started(const gko::Executor *from, const gko::Executor *to, const gko::uintptr &, const gko::uintptr &, const gko::size_type &) const override
Definition performance-debugging.cpp:105
void on_free_started(const gko::Executor *exec, const gko::uintptr &) const override
Definition performance-debugging.cpp:93
void on_allocation_started(const gko::Executor *exec, const gko::size_type &) const override
Definition performance-debugging.cpp:80
void on_copy_completed(const gko::Executor *from, const gko::Executor *to, const gko::uintptr &, const gko::uintptr &, const gko::size_type &) const override
Definition performance-debugging.cpp:113
void on_operation_completed(const gko::Executor *exec, const gko::Operation *op) const override
Definition performance-debugging.cpp:127
void on_iteration_complete(const gko::LinOp *, const gko::size_type &, const gko::LinOp *residual, const gko::LinOp *solution, const gko::LinOp *residual_norm) const override
Definition performance-debugging.cpp:201
void on_free_completed(const gko::Executor *, const gko::uintptr &location) const override
Definition performance-debugging.cpp:179
void on_allocation_completed(const gko::Executor *, const gko::size_type &num_bytes, const gko::uintptr &location) const override
Definition performance-debugging.cpp:172