Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
LLama_fastertransformer
Commits
e2f34da7
"vscode:/vscode.git/clone" did not exist on "2764db3194fc1b5069df7292fd938657d8568995"
Commit
e2f34da7
authored
Oct 13, 2023
by
zhuwenwen
Browse files
fix memory leak
parent
984f2cde
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
10 additions
and
9 deletions
+10
-9
src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc
...ertransformer/layers/sampling_layers/BaseSamplingLayer.cc
+8
-8
src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc
...n_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc
+2
-1
No files found.
src/fastertransformer/layers/sampling_layers/BaseSamplingLayer.cc
View file @
e2f34da7
...
@@ -44,10 +44,10 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
...
@@ -44,10 +44,10 @@ void BaseSamplingLayer<T>::allocateBuffer(size_t batch_size, Tensor top_k, Tenso
reinterpret_cast
<
bool
*>
(
allocator_
->
reMalloc
(
skip_decode_buf_
,
sizeof
(
bool
)
*
batch_size
,
false
));
reinterpret_cast
<
bool
*>
(
allocator_
->
reMalloc
(
skip_decode_buf_
,
sizeof
(
bool
)
*
batch_size
,
false
));
// host buffers.
// host buffers.
temperature_
=
new
float
[
batch_size
]
;
temperature_
=
(
float
*
)
std
::
realloc
((
void
*
)
temperature_
,
batch_size
*
sizeof
(
float
))
;
repetition_penalty_
=
new
float
[
batch_size
]
;
repetition_penalty_
=
(
float
*
)
std
::
realloc
((
void
*
)
repetition_penalty_
,
batch_size
*
sizeof
(
float
))
;
min_lengths_
=
new
int
[
batch_size
]
;
min_lengths_
=
(
int
*
)
std
::
realloc
((
void
*
)
min_lengths_
,
batch_size
*
sizeof
(
int
))
;
skip_decode_
=
new
bool
[
batch_size
]
;
skip_decode_
=
(
bool
*
)
std
::
realloc
((
void
*
)
skip_decode_
,
batch_size
*
sizeof
(
bool
))
;
is_allocate_buffer_
=
true
;
is_allocate_buffer_
=
true
;
}
}
...
@@ -64,10 +64,10 @@ void BaseSamplingLayer<T>::freeBuffer()
...
@@ -64,10 +64,10 @@ void BaseSamplingLayer<T>::freeBuffer()
allocator_
->
free
((
void
**
)(
&
min_lengths_buf_
));
allocator_
->
free
((
void
**
)(
&
min_lengths_buf_
));
allocator_
->
free
((
void
**
)(
&
runtime_logits_buf_
));
allocator_
->
free
((
void
**
)(
&
runtime_logits_buf_
));
allocator_
->
free
((
void
**
)(
&
skip_decode_buf_
));
allocator_
->
free
((
void
**
)(
&
skip_decode_buf_
));
delete
[]
temperature_
;
std
::
free
(
temperature_
)
;
delete
[]
repetition_penalty_
;
std
::
free
(
repetition_penalty_
)
;
delete
[]
min_lengths_
;
std
::
free
(
min_lengths_
)
;
delete
[]
skip_decode_
;
std
::
free
(
skip_decode_
)
;
is_allocate_buffer_
=
false
;
is_allocate_buffer_
=
false
;
}
}
}
}
...
...
src/fastertransformer/triton_backend/multi_gpu_gpt/ParallelGptTritonModelInstance.cc
View file @
e2f34da7
...
@@ -69,8 +69,8 @@ std::unordered_map<std::string, ft::Tensor> ParallelGptTritonModelInstance<T>::c
...
@@ -69,8 +69,8 @@ std::unordered_map<std::string, ft::Tensor> ParallelGptTritonModelInstance<T>::c
move_tensor_H2D
(
input_tensors
->
at
(
"input_ids"
),
d_input_ids_
,
&
allocator_
);
move_tensor_H2D
(
input_tensors
->
at
(
"input_ids"
),
d_input_ids_
,
&
allocator_
);
move_tensor_H2D
(
input_tensors
->
at
(
"input_lengths"
),
d_input_lengths_
,
&
allocator_
);
move_tensor_H2D
(
input_tensors
->
at
(
"input_lengths"
),
d_input_lengths_
,
&
allocator_
);
h_total_output_lengths_
=
(
uint32_t
*
)
std
::
realloc
((
void
*
)
h_total_output_lengths_
,
request_batch_size
*
sizeof
(
uint32_t
));
const
int
input_data_len
=
input_tensors
->
at
(
"input_ids"
).
shape
[
1
];
const
int
input_data_len
=
input_tensors
->
at
(
"input_ids"
).
shape
[
1
];
h_total_output_lengths_
=
reinterpret_cast
<
uint32_t
*>
(
malloc
(
request_batch_size
*
sizeof
(
uint32_t
)));
const
bool
continue_interactive
=
const
bool
continue_interactive
=
input_tensors
->
count
(
"START"
)
&&
reinterpret_cast
<
const
int32_t
*>
(
input_tensors
->
at
(
"START"
).
data
)[
0
]
==
0
;
input_tensors
->
count
(
"START"
)
&&
reinterpret_cast
<
const
int32_t
*>
(
input_tensors
->
at
(
"START"
).
data
)[
0
]
==
0
;
for
(
int
i
=
0
;
i
<
request_batch_size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
request_batch_size
;
++
i
)
{
...
@@ -293,6 +293,7 @@ void ParallelGptTritonModelInstance<T>::freeBuffer()
...
@@ -293,6 +293,7 @@ void ParallelGptTritonModelInstance<T>::freeBuffer()
allocator_
->
free
((
void
**
)(
&
d_output_ctx_emb_
));
allocator_
->
free
((
void
**
)(
&
d_output_ctx_emb_
));
allocator_
->
free
((
void
**
)(
&
d_cum_log_probs_
));
allocator_
->
free
((
void
**
)(
&
d_cum_log_probs_
));
allocator_
->
free
((
void
**
)(
&
d_is_finished_
));
allocator_
->
free
((
void
**
)(
&
d_is_finished_
));
std
::
free
(
h_total_output_lengths_
);
}
}
template
struct
ParallelGptTritonModelInstance
<
float
>;
template
struct
ParallelGptTritonModelInstance
<
float
>;
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment