# default.yml
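# Default settings for the benchmark runner: server endpoint, model, request
# generation, scheduling, and metrics. Each top-level section below configures
# one of these concerns; provider-style sections only take effect when selected.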

host: localhost
port: 8000
seed: 42
log_level: info
output_dir: ./benchmark_output/
write_json_trace: true
write_chrome_trace: true
write_metrics: true
enable_profiling: false
gpu_memory_utilization: 0.85
time_limit: 10281800 # seconds (~119 days, i.e. effectively unlimited)
replica_resource_mapping: ""
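
# Number of identical model replicas to launch for the benchmark.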
cluster:
  num_replicas: 1
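
# Model under test. load_format: dummy presumably initializes random weights
# instead of downloading a checkpoint (vLLM-style), which suits performance
# runs. attention_backend fa_vattn appears to pair FlashAttention with
# vAttention's KV-cache management; block_size 2097152 (2^21 = 2 MiB) looks
# like its physical allocation granularity rather than a token count.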
model:
  name: 01-ai/Yi-6B-200k
  tensor_parallel_degree: 1
  pipeline_parallel_degree: 1
  max_model_len: 65536
  load_format: dummy
  attention_backend: fa_vattn
  block_size: 2097152
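
# Selects the request source; the provider-specific sections that follow
# configure each option, and only the chosen provider's settings take effect.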
request_generator:
  provider: synthetic
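
# Synthetic generator: composes a length provider with an interval provider.
# Judging from the sections below, length_provider is presumably one of
# trace/zipf/uniform/fixed, and interval_provider one of static/trace/poisson/
# gamma (static appearing to need no parameters of its own).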
synthetic_request_generator:
  length_provider: trace
  interval_provider: static
  num_requests: 64
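
# Full trace replay: request sizes and arrival times both come from a recorded
# trace; the scale factors presumably shrink prompt/decode lengths and compress
# the replayed timeline.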
trace_request_generator:
  trace_file: ./data/processed_traces/sydney_enterprise.csv
  date: '2023-08-21'
  prefill_scale_factor: 0.3
  decode_scale_factor: 1
  time_scale_factor: 0.04
  max_tokens: 32768

# Trace-backed length provider for the synthetic generator: prefill/decode
# lengths are drawn from the per-request statistics in trace_file.
trace_request_length_generator:
  # trace_file: ./data/processed_traces/sharegpt_8k_filtered_stats_llama2_tokenizer.csv
  trace_file: ./data/processed_traces/arxiv_summarization_filtered_stats_llama2_tokenizer.csv
  prefill_scale_factor: 1
  decode_scale_factor: 1
  max_tokens: 32768
  min_tokens: 8192

trace_request_interval_generator:
  trace_file: ./data/processed_traces/AzureFunctionsInvocationTraceForTwoWeeksJan2021Processed.csv
  start_time: "1970-01-04 12:00:00"
  end_time: "1970-01-04 15:00:00"
  time_scale_factor: 0.3
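
# Stochastic arrival processes: poisson draws exponential inter-arrival gaps at
# the given QPS; gamma does the same with burstiness set by cv, presumably the
# coefficient of variation of the inter-arrival distribution.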
poisson_request_interval_generator:
  qps: 1.0

gamma_request_interval_generator:
  cv: 0.5
  qps: 0.2
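
# Synthetic length distributions. theta is the Zipf skew exponent (scramble
# presumably shuffles rank order); prefill_to_decode_ratio appears to fix how a
# sampled total length splits between prompt and output tokens.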
zipf_request_length_generator:
  theta: 0.6
  scramble: false
  min_tokens: 1024
  max_tokens: 4096
  prefill_to_decode_ratio: 20.0

uniform_request_length_generator:
  min_tokens: 1024
  max_tokens: 4096
  prefill_to_decode_ratio: 20.0

fixed_request_length_generator:
  prefill_tokens: 4096
  decode_tokens: 512
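
# Scheduler selection; the provider-specific sections below (sarathi, vllm,
# simple_chunking) tune whichever scheduler is chosen here.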
replica_scheduler:
  provider: vllm
  max_batch_size: 128
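
# Sarathi-style chunked prefill: prompts are processed in chunk_size-token
# pieces so decode steps can piggyback on the same batch. When the dynamic
# schedule is enabled, the chunk size presumably varies between low_chunk_size
# and high_chunk_size across chunk_schedule_stages stages.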
sarathi_scheduler:
  chunk_size: 512
  enable_dynamic_chunking_schedule: false
  # the next four params apply only when the dynamic schedule is enabled
  low_chunk_size: 128
  high_chunk_size: 2048
  # chunk_schedule_max_tokens: 4096
  chunk_schedule_max_tokens: 131072
  chunk_schedule_stages: 16

vllm_scheduler:
  max_tokens_in_batch: null

simple_chunking_scheduler:
  chunk_size: 512
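
# Metrics sink: empty wandb_* fields presumably disable Weights & Biases
# logging; the enable_* flags trade extra per-op and per-request detail for
# added collection overhead.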
metrics_store:
  wandb_project: ""
  wandb_sweep_id: ""
  wandb_run_id: ""
  wandb_group: ""
  wandb_run_name: ""
  enable_op_level_metrics: false
  enable_cpu_op_level_metrics: false
  enable_request_outputs: false
  keep_individual_batch_metrics: false
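
# Hypothetical override sketch (the key names are taken from this file, but the
# assumption that the runner merges a user-supplied YAML over these defaults is
# ours, not documented here):
#
# model:
#   name: 01-ai/Yi-6B-200k
#   max_model_len: 16384
# synthetic_request_generator:
#   num_requests: 128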