OpenDAS / ollama

Commit 6aad333c (unverified)
Authored May 09, 2024 by Michael Yang; committed by GitHub on May 09, 2024

Merge pull request #4298 from ollama/mxyng/log-cleanup

log clean up

Parents: 4fcc84e6, 58876091
Showing 3 changed files with 30 additions and 34 deletions:
llm/ext_server/server.cpp    +20  -24
llm/ext_server/utils.hpp      +8   -5
llm/server.go                 +2   -5
llm/ext_server/server.cpp
@@ -66,7 +66,7 @@ struct server_params {
 };
 
 bool server_verbose = false;
-bool server_log_json = true;
+bool server_log_json = false;
 
 enum stop_type
 {
     STOP_FULL,
@@ -266,7 +266,7 @@ struct server_slot {
         sprintf(buffer, "prompt eval time = %10.2f ms / %5d tokens (%8.2f ms per token, %8.2f tokens per second)",
                 t_prompt_processing, n_prompt_tokens_processed,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id",             id},
             {"task_id",             task_id},
             {"t_prompt_processing", t_prompt_processing},
@@ -280,7 +280,7 @@ struct server_slot {
         sprintf(buffer, "generation eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)",
                 t_token_generation, n_decoded,
                 t_token, n_tokens_second);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id",            id},
             {"task_id",            task_id},
             {"t_token_generation", t_token_generation},
@@ -290,7 +290,7 @@ struct server_slot {
         });
 
         sprintf(buffer, " total time = %10.2f ms", t_prompt_processing + t_token_generation);
-        LOG_INFO(buffer, {
+        LOG_DEBUG(buffer, {
             {"slot_id",             id},
             {"task_id",             task_id},
             {"t_prompt_processing", t_prompt_processing},
@@ -371,7 +371,7 @@ struct llama_server_context
     {
         if (clp_ctx)
         {
-            LOG_INFO("freeing clip model", {});
+            LOG_DEBUG("freeing clip model", {});
             clip_free(clp_ctx);
             clp_ctx = nullptr;
         }
@@ -392,7 +392,7 @@ struct llama_server_context
         params = params_;
         if (!params.mmproj.empty()) {
             multimodal = true;
-            LOG_INFO("Multi Modal Mode Enabled", {});
+            LOG_DEBUG("Multi Modal Mode Enabled", {});
             clp_ctx = clip_model_load(params.mmproj.c_str(), /*verbosity=*/ 1);
             if (clp_ctx == nullptr) {
                 LOG_ERROR("unable to load clip model", {{"model", params.mmproj}});
@@ -445,7 +445,7 @@ struct llama_server_context
         const int32_t n_ctx_slot = n_ctx / params.n_parallel;
 
-        LOG_INFO("initializing slots", {{"n_slots", params.n_parallel}});
+        LOG_DEBUG("initializing slots", {{"n_slots", params.n_parallel}});
         for (int i = 0; i < params.n_parallel; i++)
         {
             server_slot slot;
@@ -454,7 +454,7 @@ struct llama_server_context
             slot.n_ctx = n_ctx_slot;
             slot.n_predict = params.n_predict;
 
-            LOG_INFO("new slot", {
+            LOG_DEBUG("new slot", {
                 {"slot_id",    slot.id},
                 {"n_ctx_slot", slot.n_ctx}
             });
@@ -468,7 +468,7 @@ struct llama_server_context
                 //GGML_ASSERT(n_ctx_train % ga_w == 0 && "n_ctx_train must be a multiple of ga_w"); // NOLINT
                 //GGML_ASSERT(n_ctx >= n_ctx_train * ga_n && "n_ctx must be at least n_ctx_train * ga_n"); // NOLINT
 
-                LOG_INFO("slot self-extend", {
+                LOG_DEBUG("slot self-extend", {
                     {"slot_id", slot.id},
                     {"ga_n",    ga_n},
                     {"ga_w",    ga_w}
@@ -827,7 +827,7 @@ struct llama_server_context
         all_slots_are_idle = false;
 
-        LOG_INFO("slot is processing task", {
+        LOG_DEBUG("slot is processing task", {
             {"slot_id", slot->id},
             {"task_id", slot->task_id},
         });
@@ -1504,7 +1504,7 @@ struct llama_server_context
                 }
                 slots_data.push_back(slot_data);
             }
-            LOG_INFO("slot data", {
+            LOG_DEBUG("slot data", {
                 {"task_id",            task.id},
                 {"n_idle_slots",       n_idle_slots},
                 {"n_processing_slots", n_processing_slots}
@@ -1566,7 +1566,7 @@ struct llama_server_context
     bool update_slots() {
         if (system_need_update)
         {
-            LOG_INFO("updating system prompt", {});
+            LOG_DEBUG("updating system prompt", {});
             system_prompt_update();
         }
@@ -1576,7 +1576,7 @@ struct llama_server_context
         {
             if (system_prompt.empty() && clean_kv_cache)
             {
-                LOG_INFO("all slots are idle and system prompt is empty, clear the KV cache", {});
+                LOG_DEBUG("all slots are idle and system prompt is empty, clear the KV cache", {});
                 kv_cache_clear();
             }
             return true;
@@ -1599,7 +1599,7 @@ struct llama_server_context
                     const int n_left    = (int) system_tokens.size() + slot.n_past - n_keep;
                     const int n_discard = n_left / 2;
 
-                    LOG_INFO("slot context shift", {
+                    LOG_DEBUG("slot context shift", {
                         {"slot_id", slot.id},
                         {"task_id", slot.task_id},
                         {"n_keep",  n_keep},
@@ -1638,7 +1638,7 @@ struct llama_server_context
                 slot.command = NONE;
                 slot.t_last_used = ggml_time_us();
 
-                LOG_INFO("slot released", {
+                LOG_DEBUG("slot released", {
                     {"slot_id", slot.id},
                     {"task_id", slot.task_id},
                     {"n_ctx",   n_ctx},
@@ -1807,7 +1807,7 @@ struct llama_server_context
                             slot.ga_i = ga_i;
                         }
 
-                        LOG_INFO("slot progression", {
+                        LOG_DEBUG("slot progression", {
                             {"slot_id", slot.id},
                             {"task_id", slot.task_id},
                             {"n_past",  slot.n_past},
@@ -1822,7 +1822,7 @@ struct llama_server_context
                     if (slot.n_past == slot.n_prompt_tokens && slot.n_past > 0)
                     {
                         // we have to evaluate at least 1 token to generate logits.
-                        LOG_INFO("we have to evaluate at least 1 token to generate logits", {
+                        LOG_DEBUG("we have to evaluate at least 1 token to generate logits", {
                             {"slot_id", slot.id},
                             {"task_id", slot.task_id}
                         });
@@ -1834,7 +1834,7 @@ struct llama_server_context
                     }
 
                     int p0 = (int) system_tokens.size() + slot.n_past;
-                    LOG_INFO("kv cache rm [p0, end)", {
+                    LOG_DEBUG("kv cache rm [p0, end)", {
                         {"slot_id", slot.id},
                         {"task_id", slot.task_id},
                         {"p0",      p0}
@@ -2491,11 +2491,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         }
         else if (arg == "-v" || arg == "--verbose")
         {
-#if SERVER_VERBOSE != 1
-            LOG_WARNING("server.cpp is not built with verbose logging.", {});
-#else
             server_verbose = true;
-#endif
         }
         else if (arg == "--mlock")
         {
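Note on the hunk above: the compile-time #if SERVER_VERBOSE guard around -v/--verbose is removed, so the flag now always sets server_verbose at runtime. Combined with the new LOG_DEBUG level introduced in utils.hpp below, this turns what used to be a build-time verbosity switch into a runtime one: the messages demoted to DEBUG throughout this file reappear only when the server is started with --verbose.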
@@ -2601,7 +2597,7 @@ static void server_params_parse(int argc, char **argv, server_params &sparams,
         else if (arg == "--log-disable")
         {
             log_set_target(stdout);
-            LOG_INFO("logging to file is disabled.", {});
+            LOG_DEBUG("logging to file is disabled.", {});
         }
         else if (arg == "--slots-endpoint-disable")
         {
@@ -2732,7 +2728,7 @@ static void log_server_request(const httplib::Request &req, const httplib::Respo
         return;
     }
 
-    LOG_INFO("request", {
+    LOG_DEBUG("request", {
         {"remote_addr", req.remote_addr},
         {"remote_port", req.remote_port},
         {"status",      res.status},
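Taken together, the server.cpp changes demote routine per-slot and per-request status messages from INFO to a new DEBUG level, and flip the default log format from JSON to plain text (server_log_json = false). The following is a minimal, self-contained sketch of the resulting behavior, not the real implementation; server_log here is a stripped-down stand-in for the function defined in utils.hpp:

    #include <cstdio>
    #include <cstring>

    bool server_verbose  = false;  // set at runtime by -v / --verbose
    bool server_log_json = false;  // the new default after this commit

    // Stand-in for utils.hpp's server_log: DEBUG messages are dropped
    // unless the server is running in verbose mode.
    static void server_log(const char *level, const char *message) {
        if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose) {
            return;  // same early-out gate this commit adds to utils.hpp
        }
        printf("%s | %s\n", level, message);
    }

    int main() {
        server_log("DEBUG", "slot is processing task");  // suppressed by default
        server_verbose = true;
        server_log("DEBUG", "slot is processing task");  // printed under --verbose
        return 0;
    }

So a default (non-verbose) run now stays quiet about slot bookkeeping, while problems still surface through LOG_WARNING and LOG_ERROR.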
llm/ext_server/utils.hpp
@@ -55,9 +55,10 @@ extern bool server_log_json;
     } while (0)
 #endif
 
-#define LOG_ERROR(  MSG, ...) server_log("ERR",   __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_ERROR(  MSG, ...) server_log("ERROR", __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_WARNING(MSG, ...) server_log("WARN",  __func__, __LINE__, MSG, __VA_ARGS__)
 #define LOG_INFO(   MSG, ...) server_log("INFO",  __func__, __LINE__, MSG, __VA_ARGS__)
+#define LOG_DEBUG(  MSG, ...) server_log("DEBUG", __func__, __LINE__, MSG, __VA_ARGS__)
 
 enum server_state {
     SERVER_STATE_LOADING_MODEL,  // Server is starting up, model not fully loaded yet
@@ -123,6 +124,10 @@ static inline void server_log(const char *level, const char *function, int line,
         {"timestamp", time(nullptr)},
     };
 
+    if (strncmp("DEBUG", level, strlen(level)) == 0 && !server_verbose)
+    {
+        return;
+    }
     if (server_log_json) {
         log.merge_patch(
             {
@@ -137,14 +142,12 @@ static inline void server_log(const char *level, const char *function, int line,
         std::cout << log.dump(-1, ' ', false, json::error_handler_t::replace) << "\n" << std::flush;
     } else {
-        char buf[1024];
-        snprintf(buf, 1024, "%4s [%24s] %s", level, function, message);
-
         if (!extra.empty()) {
             log.merge_patch(extra);
         }
 
         std::stringstream ss;
-        ss << buf << " |";
+        ss << level << " [" << function << "] " << message << " |";
         for (const auto& el : log.items())
         {
             const std::string value = el.value().dump(-1, ' ', false, json::error_handler_t::replace);
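Two changes land in utils.hpp: server_log gains an early return that filters DEBUG-level messages whenever server_verbose is off, and the plain-text branch drops the fixed char buf[1024] / snprintf pair (which silently truncated anything longer than 1023 bytes) in favor of streaming level, function, and message directly. A rough sketch of the reworked plain-text path follows; the std::map is a hypothetical stand-in for the nlohmann::json object whose items() the real code iterates:

    #include <iostream>
    #include <map>
    #include <sstream>
    #include <string>

    // No intermediate fixed-size buffer: the message can be arbitrarily
    // long without being truncated at 1024 bytes.
    static void log_plain(const std::string &level, const std::string &function,
                          const std::string &message,
                          const std::map<std::string, std::string> &extra) {
        std::stringstream ss;
        ss << level << " [" << function << "] " << message << " |";
        for (const auto &el : extra) {  // real code walks log.items() on a json object
            ss << " " << el.first << "=" << el.second;
        }
        std::cout << ss.str() << "\n" << std::flush;
    }

    int main() {
        log_plain("DEBUG", "update_slots", "slot released",
                  {{"slot_id", "0"}, {"task_id", "42"}});
        return 0;
    }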
llm/server.go
@@ -157,11 +157,8 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 		"--batch-size", fmt.Sprintf("%d", opts.NumBatch),
 		"--embedding",
 	}
 
-	if envconfig.Debug {
-		params = append(params, "--log-format", "json")
-	} else {
-		params = append(params, "--log-disable")
-	}
+	params = append(params, "--log-disable")
 
 	if opts.NumGPU >= 0 {
 		params = append(params, "--n-gpu-layers", fmt.Sprintf("%d", opts.NumGPU))
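On the Go side, NewLlamaServer no longer branches on envconfig.Debug to pick between JSON runner logs (--log-format json) and disabled file logging: it now passes --log-disable unconditionally. Per the server.cpp hunk above, --log-disable points the runner's log target at stdout rather than a file, so the parent process presumably keeps collecting runner output from the subprocess streams, and verbosity is controlled at runtime by --verbose and the DEBUG gate rather than by the debug environment setting.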