performance-metrics: add restore latency performance metric

This patch calculates the recovery latency from the start of
recovery program to the end of restore.

Signed-off-by: Songqian Li <sionli@tencent.com>
This commit is contained in:
Songqian Li 2024-07-19 10:49:22 +08:00 committed by Rob Bradford
parent 59185050d5
commit e18d32bac2
3 changed files with 183 additions and 2 deletions

View File

@ -31,6 +31,7 @@ $ ./scripts/dev_cli.sh tests --metrics -- -- --report-file /tmp/metrics.json --t
``` ```
To set custom timeout or test iterations for all performance tests: To set custom timeout or test iterations for all performance tests:
``` ```
$ ./scripts/dev_cli.sh tests --metrics -- -- --timeout 5 --iterations 10 $ ./scripts/dev_cli.sh tests --metrics -- -- --timeout 5 --iterations 10
``` ```
@ -70,6 +71,7 @@ data.
| | block_multi_queue_write_IOPS | 10 | 5 | | | block_multi_queue_write_IOPS | 10 | 5 |
| | block_multi_queue_random_read_IOPS | 10 | 5 | | | block_multi_queue_random_read_IOPS | 10 | 5 |
| | block_multi_queue_random_write_IOPS | 10 | 5 | | | block_multi_queue_random_write_IOPS | 10 | 5 |
| Other | restore_latency_time_ms | 2 | 10 |
## Output Format ## Output Format
@ -84,7 +86,6 @@ param is set. The fields included in JSON include:
| date | Date for executing the program | | date | Date for executing the program |
| results | A list of metrics | | results | A list of metrics |
## Example ## Example
Here is an example of generating metrics data for the boot time using Here is an example of generating metrics data for the boot time using

View File

@ -274,7 +274,7 @@ mod adjuster {
} }
} }
const TEST_LIST: [PerformanceTest; 29] = [ const TEST_LIST: [PerformanceTest; 30] = [
PerformanceTest { PerformanceTest {
name: "boot_time_ms", name: "boot_time_ms",
func_ptr: performance_boot_time, func_ptr: performance_boot_time,
@ -306,6 +306,16 @@ const TEST_LIST: [PerformanceTest; 29] = [
}, },
unit_adjuster: adjuster::s_to_ms, unit_adjuster: adjuster::s_to_ms,
}, },
PerformanceTest {
name: "restore_latency_time_ms",
func_ptr: performance_restore_latency,
control: PerformanceTestControl {
test_timeout: 2,
test_iterations: 10,
..PerformanceTestControl::default()
},
unit_adjuster: adjuster::identity,
},
PerformanceTest { PerformanceTest {
name: "boot_time_16_vcpus_pmem_ms", name: "boot_time_16_vcpus_pmem_ms",
func_ptr: performance_boot_time_pmem, func_ptr: performance_boot_time_pmem,

View File

@ -23,6 +23,7 @@ pub const FOCAL_IMAGE_NAME: &str = "focal-server-cloudimg-arm64-custom-20210929-
enum Error { enum Error {
BootTimeParse, BootTimeParse,
Infra(InfraError), Infra(InfraError),
RestoreTimeParse,
} }
impl From<InfraError> for Error { impl From<InfraError> for Error {
@ -76,6 +77,26 @@ fn direct_kernel_boot_path() -> PathBuf {
kernel_path kernel_path
} }
fn remote_command(api_socket: &str, command: &str, arg: Option<&str>) -> bool {
let mut cmd = std::process::Command::new(clh_command("ch-remote"));
cmd.args([&format!("--api-socket={}", api_socket), command]);
if let Some(arg) = arg {
cmd.arg(arg);
}
let output = cmd.output().unwrap();
if output.status.success() {
true
} else {
eprintln!(
"Error running ch-remote command: {:?}\nstderr: {}",
&cmd,
String::from_utf8_lossy(&output.stderr)
);
false
}
}
pub fn performance_net_throughput(control: &PerformanceTestControl) -> f64 { pub fn performance_net_throughput(control: &PerformanceTestControl) -> f64 {
let test_timeout = control.test_timeout; let test_timeout = control.test_timeout;
let (rx, bandwidth) = control.net_control.unwrap(); let (rx, bandwidth) = control.net_control.unwrap();
@ -404,6 +425,120 @@ pub fn performance_block_io(control: &PerformanceTestControl) -> f64 {
} }
} }
// Parse the event_monitor file based on the format that each event
// is followed by a double newline
fn parse_event_file(event_file: &str) -> Vec<serde_json::Value> {
let content = fs::read(event_file).unwrap();
let mut ret = Vec::new();
for entry in String::from_utf8_lossy(&content)
.trim()
.split("\n\n")
.collect::<Vec<&str>>()
{
ret.push(serde_json::from_str(entry).unwrap());
}
ret
}
fn parse_restore_time_output(events: &[serde_json::Value]) -> Result<f64, Error> {
for entry in events.iter() {
if entry["event"].as_str().unwrap() == "restored" {
let duration = entry["timestamp"]["secs"].as_u64().unwrap() as f64 * 1_000f64
+ entry["timestamp"]["nanos"].as_u64().unwrap() as f64 / 1_000_000f64;
return Ok(duration);
}
}
Err(Error::RestoreTimeParse)
}
fn measure_restore_time(
cmd: &mut GuestCommand,
event_file: &str,
test_timeout: u32,
) -> Result<f64, Error> {
let mut child = cmd
.capture_output()
.verbosity(VerbosityLevel::Warn)
.set_print_cmd(false)
.spawn()
.unwrap();
thread::sleep(Duration::new((test_timeout / 2) as u64, 0));
let _ = child.kill();
let output = child.wait_with_output().unwrap();
let json_events = parse_event_file(event_file);
parse_restore_time_output(&json_events).inspect_err(|_| {
eprintln!(
"\n\n==== Start child stdout ====\n\n{}\n\n==== End child stdout ====\
\n\n==== Start child stderr ====\n\n{}\n\n==== End child stderr ====",
String::from_utf8_lossy(&output.stdout),
String::from_utf8_lossy(&output.stderr)
)
})
}
pub fn performance_restore_latency(control: &PerformanceTestControl) -> f64 {
let r = std::panic::catch_unwind(|| {
let focal = UbuntuDiskConfig::new(FOCAL_IMAGE_NAME.to_string());
let guest = performance_test_new_guest(Box::new(focal));
let api_socket_source = String::from(
guest
.tmp_dir
.as_path()
.join("cloud-hypervisor.sock")
.to_str()
.unwrap(),
);
let mut child = GuestCommand::new(&guest)
.args(["--api-socket", &api_socket_source])
.args([
"--cpus",
&format!("boot={}", control.num_boot_vcpus.unwrap_or(1)),
])
.args(["--memory", "size=256M"])
.args(["--kernel", direct_kernel_boot_path().to_str().unwrap()])
.args(["--cmdline", DIRECT_KERNEL_BOOT_CMDLINE])
.args(["--console", "off"])
.default_disks()
.set_print_cmd(false)
.spawn()
.unwrap();
thread::sleep(Duration::new((control.test_timeout / 2) as u64, 0));
let snapshot_dir = String::from(guest.tmp_dir.as_path().join("snapshot").to_str().unwrap());
std::fs::create_dir(&snapshot_dir).unwrap();
assert!(remote_command(&api_socket_source, "pause", None));
assert!(remote_command(
&api_socket_source,
"snapshot",
Some(format!("file://{}", snapshot_dir).as_str()),
));
let _ = child.kill();
let event_path = String::from(guest.tmp_dir.as_path().join("event.json").to_str().unwrap());
let mut cmd = GuestCommand::new(&guest);
let c = cmd
.args([
"--restore",
format!("source_url=file://{}", snapshot_dir).as_str(),
])
.args(["--event-monitor", format!("path={}", event_path).as_str()]);
measure_restore_time(c, event_path.as_str(), control.test_timeout).unwrap()
});
match r {
Ok(r) => r,
Err(_) => {
panic!("test failed!");
}
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
@ -494,7 +629,42 @@ cloud-hypervisor: 613.57361ms: <vcpu0> INFO:vmm/src/vm.rs:392 -- [Debug I/O port
assert_eq!(parse_boot_time_output(output.as_bytes()).unwrap(), 0.4525); assert_eq!(parse_boot_time_output(output.as_bytes()).unwrap(), 0.4525);
} }
#[test]
fn test_parse_restore_time_output() {
let output = r#"
{
"timestamp": {
"secs": 0,
"nanos": 4664404
},
"source": "virtio-device",
"event": "activated",
"properties": {
"id": "__rng"
}
}
{
"timestamp": {
"secs": 0,
"nanos": 5505133
},
"source": "vm",
"event": "restored",
"properties": null
}
"#;
let mut ret = Vec::new();
for entry in String::from(output)
.trim()
.split("\n\n")
.collect::<Vec<&str>>()
{
ret.push(serde_json::from_str(entry).unwrap());
}
assert_eq!(parse_restore_time_output(&ret).unwrap(), 5.505133_f64);
}
#[test] #[test]
fn test_parse_fio_output() { fn test_parse_fio_output() {
let output = r#" let output = r#"