Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
bitsandbytes
Commits
332530ba
Unverified
Commit
332530ba
authored
Feb 05, 2024
by
Rickard
Committed by
GitHub
Feb 05, 2024
Browse files
quantize_block C->C++, use std::thread everywhere (#1024)
parent
8c507d92
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
27 additions
and
58 deletions
+27
-58
csrc/common.cpp
csrc/common.cpp
+10
-14
csrc/common.h
csrc/common.h
+1
-1
csrc/cpu_ops.cpp
csrc/cpu_ops.cpp
+16
-43
No files found.
csrc/common.cpp
View file @
332530ba
#include <common.h>
#include <float.h>
void
*
quantize_block
(
void
*
argument
s
)
{
void
quantize_block
(
const
quantize_block_args
&
arg
s
)
{
// 1. find absmax in block
// 2. divide input value by absmax to normalize into [-1.0, 1.0]
// 3. do binary search to find the closest value
// 4. check minimal distance
// 5. store index
struct
quantize_block_args
*
args
=
(
quantize_block_args
*
)
arguments
;
// 1. find absmax in block
float
absmax_block
=
-
FLT_MAX
;
for
(
long
long
i
=
args
->
block_idx
;
i
<
args
->
block_end
;
i
++
)
absmax_block
=
fmax
(
absmax_block
,
fabs
(
args
->
A
[
i
]));
for
(
long
long
i
=
args
.
block_idx
;
i
<
args
.
block_end
;
i
++
)
absmax_block
=
fmax
(
absmax_block
,
fabs
(
args
.
A
[
i
]));
args
->
absmax
[
args
->
block_idx
/
args
->
blocksize
]
=
absmax_block
;
args
.
absmax
[
args
.
block_idx
/
args
.
blocksize
]
=
absmax_block
;
for
(
long
long
i
=
args
->
block_idx
;
i
<
args
->
block_end
;
i
++
)
{
for
(
long
long
i
=
args
.
block_idx
;
i
<
args
.
block_end
;
i
++
)
{
// 2. divide input value by absmax to normalize into [-1.0, 1.0]
// 3. do binary search to find the closest value
float
normed_value
=
args
->
A
[
i
]
/
absmax_block
;
long
long
idx
=
args
->
bin_searcher
->
scalar
(
normed_value
);
float
normed_value
=
args
.
A
[
i
]
/
absmax_block
;
long
long
idx
=
args
.
bin_searcher
->
scalar
(
normed_value
);
// 4. check minimal distance
// The binary search returns always the value to the left, which might not be the closest value
if
(
idx
<
255
)
{
float
dist_left
=
fabs
(
normed_value
-
(
args
->
code
[
idx
]));
float
dist_right
=
fabs
(
normed_value
-
(
args
->
code
[
idx
+
1
]));
float
dist_left
=
fabs
(
normed_value
-
(
args
.
code
[
idx
]));
float
dist_right
=
fabs
(
normed_value
-
(
args
.
code
[
idx
+
1
]));
if
(
dist_right
<
dist_left
)
{
idx
+=
1
;
}
}
// 5. store index
args
->
out
[
i
]
=
(
unsigned
char
)
idx
;
args
.
out
[
i
]
=
(
unsigned
char
)
idx
;
}
return
NULL
;
}
csrc/common.h
View file @
332530ba
...
...
@@ -20,6 +20,6 @@ struct quantize_block_args {
};
void
*
quantize_block
(
void
*
argument
s
);
void
quantize_block
(
const
quantize_block_args
&
arg
s
);
#endif
csrc/cpu_ops.cpp
View file @
332530ba
#include <BinSearch.h>
#ifdef _WIN32
#include <thread>
#else
#include <pthread.h>
#endif
#include <common.h>
#include <thread>
using
namespace
BinSearch
;
...
...
@@ -30,21 +26,13 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
BinAlgo
<
Scalar
,
float
,
Direct2
>
bin_searcher
(
code
,
elements_code
);
int
thread_wave_size
=
256
;
// we chunk the thre
s
ds into waves of 256 since the max limit is
// we chunk the thre
a
ds into waves of 256 since the max limit is
// between 16k and 64k on Linux (we reach this when running BLOOM-176B with a large batch size)
for
(
long
long
offset
=
0
;
offset
<
num_blocks
;
offset
+=
thread_wave_size
)
{
long
long
valid_chunks
=
num_blocks
-
offset
>=
thread_wave_size
?
thread_wave_size
:
num_blocks
-
offset
;
#ifdef _WIN32
std
::
thread
*
threads
=
(
std
::
thread
*
)
malloc
(
sizeof
(
std
::
thread
)
*
valid_chunks
);
#else
pthread_t
*
threads
=
(
pthread_t
*
)
malloc
(
sizeof
(
pthread_t
)
*
valid_chunks
);
#endif
struct
quantize_block_args
**
args
=
(
quantize_block_args
**
)
malloc
(
valid_chunks
*
sizeof
(
quantize_block_args
*
));
for
(
long
long
i
=
0
;
i
<
valid_chunks
;
i
++
)
args
[
i
]
=
(
quantize_block_args
*
)
malloc
(
sizeof
(
quantize_block_args
));
std
::
vector
<
std
::
thread
>
threads
(
valid_chunks
);
std
::
vector
<
quantize_block_args
>
args
(
valid_chunks
);
int
chunks_processed
=
0
;
for
(
long
long
block_idx
=
offset
*
blocksize
;
block_idx
<
n
;
block_idx
+=
blocksize
)
...
...
@@ -52,39 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
long
long
valid_items
=
n
-
block_idx
>=
blocksize
?
blocksize
:
n
-
block_idx
;
long
long
block_end
=
block_idx
+
valid_items
;
struct
quantize_block_args
*
arg
=
args
[
chunks_processed
];
arg
->
bin_searcher
=
&
bin_searcher
;
arg
->
code
=
code
;
arg
->
A
=
A
;
arg
->
absmax
=
absmax
;
arg
->
out
=
out
;
arg
->
block_end
=
block_end
;
arg
->
block_idx
=
block_idx
;
arg
->
threadidx
=
block_idx
/
blocksize
;
arg
->
blocksize
=
blocksize
;
#ifdef _WIN32
new
(
&
threads
[
chunks_processed
])
std
::
thread
(
quantize_block
,
arg
);
#else
pthread_create
(
&
threads
[
chunks_processed
],
NULL
,
&
quantize_block
,
(
void
*
)
arg
);
#endif
struct
quantize_block_args
&
arg
=
args
[
chunks_processed
];
arg
.
bin_searcher
=
&
bin_searcher
;
arg
.
code
=
code
;
arg
.
A
=
A
;
arg
.
absmax
=
absmax
;
arg
.
out
=
out
;
arg
.
block_end
=
block_end
;
arg
.
block_idx
=
block_idx
;
arg
.
threadidx
=
block_idx
/
blocksize
;
arg
.
blocksize
=
blocksize
;
threads
[
chunks_processed
]
=
std
::
thread
([
arg
]
{
quantize_block
(
arg
);
});
chunks_processed
+=
1
;
if
(
chunks_processed
==
valid_chunks
){
break
;
}
}
for
(
int
i
=
0
;
i
<
valid_chunks
;
i
++
)
{
#ifdef _WIN32
threads
[
i
].
join
();
#else
int
err
=
pthread_join
(
threads
[
i
],
NULL
);
#endif
}
free
(
threads
);
for
(
int
i
=
0
;
i
<
valid_chunks
;
i
++
)
free
(
args
[
i
]);
free
(
args
);
}
}
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment