OpenDAS / ktransformers · Commits

Commit 18c42e67, authored Jul 27, 2024 by chenxl

    Initial commit

Changes: 247 files in this commit; showing the first 20 changed files, with 4477 additions and 0 deletions (+4477 −0).
ktransformers/website/src/router/index.ts          +24   −0
ktransformers/website/src/shims-vue.d.ts           +10   −0
ktransformers/website/src/store/index.ts           +14   −0
ktransformers/website/src/utils/copy.ts            +112  −0
ktransformers/website/src/utils/types.ts           +126  −0
ktransformers/website/src/views/home.vue           +718  −0
ktransformers/website/tests/unit/example.spec.ts   +12   −0
ktransformers/website/tsconfig.json                +46   −0
ktransformers/website/vue.config.js                +41   −0
pyproject.toml                                     +8    −0
requirements-local_chat.txt                        +5    −0
setup.py                                           +250  −0
third_party/llama.cpp                              +1    −0
third_party/llamafile/README.md                    +1    −0
third_party/llamafile/bench.h                      +25   −0
third_party/llamafile/flags.cpp                    +8    −0
third_party/llamafile/flags.h                      +8    −0
third_party/llamafile/iqk_mul_mat.inc              +3050 −0
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp     +8    −0
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp     +10   −0
ktransformers/website/src/router/index.ts  (new file, mode 100644)

import { createRouter, createWebHashHistory, RouteRecordRaw, createWebHistory } from 'vue-router'
import HomeView from '@/views/home.vue'

const routes: Array<RouteRecordRaw> = [
  {
    path: '/',
    name: 'home',
    component: HomeView,
    redirect: '/chat',
    children: [{
      path: '/chat',
      name: '',
      component: () => import(/* webpackChunkName: "about" */ '../components/chat/index.vue')
    },]
  },
]

const router = createRouter({
  history: createWebHashHistory(),
  routes
})

export default router
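
A router built this way is normally installed in the application entry point. The entry file is not part of this diff, so the following is only a sketch of the standard Vue 3 wiring these files imply (App.vue is an assumed scaffold component):

    // main.ts (sketch only; not in this commit)
    import { createApp } from 'vue'
    import App from './App.vue'   // assumed root component
    import router from './router'
    import store from './store'

    createApp(App).use(store).use(router).mount('#app')

Because the router uses createWebHashHistory, routes live after a hash in the URL (e.g. /web/#/chat, given the publicPath of '/web/' configured in vue.config.js below).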
ktransformers/website/src/shims-vue.d.ts  (new file, mode 100644)

/* eslint-disable */
declare module '*.vue' {
  import type { DefineComponent } from 'vue'
  const component: DefineComponent<{}, {}, any>
  export default component
}
declare module '@/locals'
declare module 'pdfobject';
ktransformers/website/src/store/index.ts  (new file, mode 100644)

import { createStore } from 'vuex'

export default createStore({
  state: {
  },
  getters: {
  },
  mutations: {
  },
  actions: {
  },
  modules: {
  }
})
ktransformers/website/src/utils/copy.ts  (new file, mode 100644)

import { ElMessage } from "element-plus";

const copy = (value: string) => {
  // Try using the navigator.clipboard.writeText method
  if (navigator.clipboard && window.isSecureContext) {
    navigator.clipboard
      .writeText(value)
      .then(() => {
        // Use ElMessage to display the success message on Windows
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制成功!", // "Content copied successfully!"
            type: "success",
            plain: true,
          });
        } else {
          // Use a custom DOM element to display the success message on macOS
          showCopySuccessMessage();
        }
      })
      .catch(() => {
        // Use ElMessage to display the failure message on Windows
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制失败!", // "Content copy failed!"
            type: "error",
            plain: true,
          });
        } else {
          // Use a custom DOM element to display the failure message on macOS
          showCopyErrorMessage();
        }
      });
  } else {
    const textarea = document.createElement("textarea");
    textarea.value = value;
    document.body.appendChild(textarea);
    textarea.select();
    try {
      const successful = document.execCommand('copy');
      if (successful) {
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制成功!",
            type: "success",
            plain: true,
          });
        } else {
          showCopySuccessMessage();
        }
      } else {
        if (navigator.appVersion.includes("Win")) {
          ElMessage({
            message: "内容复制失败!",
            type: "error",
            plain: true,
          });
        } else {
          showCopyErrorMessage();
        }
      }
    } catch (err) {
      if (navigator.appVersion.includes("Win")) {
        ElMessage({
          message: "内容复制失败!",
          type: "error",
          plain: true,
        });
      } else {
        showCopyErrorMessage();
      }
    }
    document.body.removeChild(textarea);
  }
};

function showCopySuccessMessage() {
  const messageElement = document.createElement('div');
  messageElement.textContent = '内容复制成功!'; // "Content copied successfully!"
  messageElement.style.position = 'fixed';
  messageElement.style.bottom = '10px';
  messageElement.style.left = '50%';
  messageElement.style.transform = 'translateX(-50%)';
  messageElement.style.padding = '10px';
  messageElement.style.backgroundColor = '#4CAF50';
  messageElement.style.color = 'white';
  messageElement.style.borderRadius = '15px';
  messageElement.style.zIndex = '1000';
  document.body.appendChild(messageElement);
  setTimeout(() => {
    document.body.removeChild(messageElement);
  }, 3000);
}

function showCopyErrorMessage() {
  const messageElement = document.createElement('div');
  messageElement.textContent = '内容复制失败!'; // "Content copy failed!"
  messageElement.style.position = 'fixed';
  messageElement.style.bottom = '10px';
  messageElement.style.left = '50%';
  messageElement.style.transform = 'translateX(-50%)';
  messageElement.style.padding = '10px';
  messageElement.style.backgroundColor = '#F44336';
  messageElement.style.color = 'white';
  messageElement.style.borderRadius = '5px';
  messageElement.style.zIndex = '1000';
  document.body.appendChild(messageElement);
  setTimeout(() => {
    document.body.removeChild(messageElement);
  }, 3000);
}

export default copy;
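
A hypothetical call site, to show the intended contract: the default export is the copy function itself, and all user feedback (an ElMessage toast on Windows, an injected DOM message elsewhere) is handled inside the module:

    // sketch only; the handler name is illustrative
    import copy from '@/utils/copy'

    const onCopyClick = (messageText: string) => {
      copy(messageText) // feedback is displayed by copy() itself
    }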
ktransformers/website/src/utils/types.ts  (new file, mode 100644)

export interface IAssistant {
  id: string;
  object: string;
  created_at: number;
  name?: string;
  description?: string;
  model: string;
  instructions?: string;
  tools: any[];
  tool_resources?: object;
  metadata?: { [key: string]: any }
  top_p?: number;
  temperature?: number;
  response_format: string | object;
}

export interface IAssistantWithStatus {
  build_status: { status: string }
  id: string;
  object: string;
  created_at: number;
  name?: string;
  description?: string;
  model: string;
  instructions?: string;
  tools: any[];
  tool_resources?: object;
  metadata?: { [key: string]: any }
  top_p?: number;
  temperature?: number;
  response_format: string | object;
}

export interface IMessage {
  id: string;
  object: string;
  created_at: number;
  thread_id: string;
  status: string;
  incomplete_details?: object;
  completed_at?: number;
  incomplete_at?: number;
  role: string;
  content: any[];
  assistant_id?: string;
  run_id?: string;
  attachments?: any[];
  metadata: { [key: string]: any }
}

export interface IThread {
  id: string;
  object: string;
  created_at: number;
  tool_resources?: object;
  metadata?: { [key: string]: any }
}

export interface IRun {
  id: string;
  object: string;
  created_at: number;
  thread_id: string,
  assistant_id: string,
  status: string,
  required_action?: object,
  last_error?: object,
  expires_at?: number,
  started_at?: number,
  cancelled_at?: number,
  failed_at?: number,
  completed_at?: number,
  incomplete_details?: object,
  model: string,
  instructions: string,
  tools: any[],
  metadata: Map<string, string>,
  usage?: object,
  temperature?: number,
  top_p?: number,
  max_prompt_tokens?: number,
  max_completion_tokens?: number,
  truncation_strategy: object,
  tool_choice: string | object,
  response_format: string | object,
}

export interface IFile {
  id: string,
  bytes: number,
  created_at: number,
  filename: string,
  object: string,
  purpose: string,
}

export interface IMessageData {
  role: string;
  content: any[];
  created_at?: number;
  assistant_id?: string,
}

export interface IThreadAndMessageAndAssistant {
  thread: IThread;
  first_message: IMessage;
  assistant: IAssistantWithStatus
}

export interface IDeleteResult {
  id: string;
  object: string;
  deleted: boolean;
}

export interface IBuildData {
  parsed_file_count: number;
  total_file_count: number;
  prefilling_current: number;
  prefilling_total: number;
  build_completed_time: number;
  build_started_time: number;
  storage_total: number;
  storage_usage: number;
  status: string
}
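
As a usage sketch, here is a message object shaped the way home.vue consumes it later in this diff (it reads content[0].text.value); the exact layout of a content element is an assumption inferred from that access pattern, since the API layer is not part of this commit:

    import { IMessageData } from '@/utils/types'

    const draft: IMessageData = {
      role: 'user',
      content: [{ type: 'text', text: { value: 'Hello, KTransformers!' } }],
      created_at: Math.floor(Date.now() / 1000), // seconds, matching the created_at comparisons in home.vue
    }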
ktransformers/website/src/views/home.vue  (new file, mode 100644)

<template>
  <div class="home flex-row">
    <nav class="left-panel flex-column">
      <div class="logo-box">
        <div class="logo flex-row">
          <img class="img" src="../../public/images/three.png" />
          <span class="text">{{ projectName }}</span>
        </div>
        <div class="version">{{ projectVersion }}</div>
      </div>
      <div class="divider"></div>
      <div class="assistant-box">
        <div class="assistant-list">
          <ul>
            <li
              class="assistant-item flex-row"
              v-for="(item, index) in assistantList"
              :key="index"
              @click="setActiveAssistant(item)"
            >
              <img src="../../public/images/avatar.png" />
              <span class="name flex-unit">{{ item.name }}</span>
              <i class="iconfont icon-edit"></i>
            </li>
          </ul>
        </div>
      </div>
      <div class="divider"></div>
      <!-- History area -->
      <div class="history-box flex-unit">
        <div class="">
          <div class="date">{{ $t("home.today") }}</div>
          <ul>
            <li
              v-for="(item, index) in todayThreads"
              :key="index"
              class="chat-item"
              :class="{ active: activeThreadIndex === index }"
              @click="setActiveThreadIndex(index)"
            >
              <div class="chat-abbr">{{ firstMessages[index] }}</div>
              <div class="chat-ops flex-row">
                <img src="../../public/images/avatar.png" />
                <div class="name flex-unit">{{ assistantOfThread[index].name || "" }}</div>
                <i class="iconfont icon-delete" @click="delThread(index)"></i>
              </div>
            </li>
          </ul>
          <div class="date" v-if="previousThreads.length > 0">{{ $t("home.previous") }}</div>
          <ul>
            <li
              v-for="(item, index) in previousThreads"
              :key="index"
              class="chat-item"
              :class="{
                active: activeThreadIndex === index + todayThreads.length,
              }"
              @click="setActiveThreadIndex(index + todayThreads.length)"
            >
              <div class="chat-abbr">{{ firstMessages[index + todayThreads.length] }}</div>
              <div class="chat-ops flex-row">
                <img src="../../public/images/avatar.png" />
                <div class="name flex-unit">{{ assistantOfThread[index + todayThreads.length].name || "" }}</div>
                <i class="iconfont icon-delete" @click="delThread(index + todayThreads.length)"></i>
              </div>
            </li>
          </ul>
        </div>
      </div>
      <div class="icon-box example-2">
        <div class="iconhub icon-content" @click="navigateToIconHub">
          <svg xmlns="http://www.w3.org/2000/svg" width="16" height="16" fill="currentColor"
               class="bi bi-github" viewBox="0 0 16 16" xml:space="preserve">
            <path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27s1.36.09 2 .27c1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.01 8.01 0 0 0 16 8c0-4.42-3.58-8-8-8"
                  fill="currentColor"></path>
          </svg>
          <div class="tooltip">GitHub</div>
        </div>
        <div class="iconlanguage" @click="changeLanguage">
          <svg v-if="!flag" t="1719306572024" class="icon" viewBox="0 0 1024 1024" version="1.1"
               xmlns="http://www.w3.org/2000/svg" p-id="16849"
               data-spm-anchor-id="a313x.search_index.0.i21.366e3a81tz0TYS" width="18" height="18">
            <path d="M64.064 768V192H448.64v64H127.936v192h320v64h-320v192h320v64H64.064z m511.872 0V192h64l256 447.68V192h64v576h-64l-256-447.168V768h-64z"
                  p-id="16850" data-spm-anchor-id="a313x.search_index.0.i22.366e3a81tz0TYS"
                  class="selected" fill="#000000"></path>
          </svg>
          <svg v-else t="1719306494614" class="icon" viewBox="0 0 1024 1024" version="1.1"
               xmlns="http://www.w3.org/2000/svg" p-id="12325" width="18" height="18">
            <path d="M1023.488 831.552h-96l-265.472-451.904c-8.96-12.8-16-25.344-21.44-37.888H638.08c2.176 12.992 3.2 40.128 3.2 81.408v408.32L576 836.928V256h101.568l257.024 445.632c14.592 20.992 23.232 34.368 25.92 40.128h1.6c-2.688-16.512-4.032-44.8-4.032-84.736v-399.36L1024 256l-0.512 575.552zM435.008 804.224c-42.752 21.76-96.384 32.64-160.896 32.64-83.2 0-149.76-25.6-199.488-76.736C24.896 708.928 0 641.344 0 557.12c0-90.432 27.968-163.2 84.032-218.368C140.032 283.52 211.072 256 297.344 256c55.552 0 101.376 7.616 137.6 22.848v75.84a284.992 284.992 0 0 0-136.832-33.408c-64.768 0-117.504 20.864-158.208 62.592-40.768 41.728-61.184 98.048-61.184 168.96 0 67.2 19.008 120.576 57.024 160.128 38.016 39.552 87.744 59.328 149.248 59.328 57.536 0 107.52-12.544 150.016-37.76v69.696z"
                  fill="#000000" p-id="12326"
                  data-spm-anchor-id="a313x.search_index.0.i16.366e3a81tz0TYS" class="selected"></path>
          </svg>
        </div>
      </div>
    </nav>
    <router-view v-slot="{ Component }" class="main-panel flex-unit">
      <component
        :is="Component"
        :chatInit="chatInit"
        :activeAssistant="activeAssistant"
        :activeThread="activeThread"
        :messages="allMessageInCurrentThread"
        :completedAssistant="assistantList"
        :inputDisabled="inputDisabled"
        @updateAssistant="handleUpdateAssistant"
      />
    </router-view>
  </div>
</template>
<script lang="ts">
import { defineComponent, ref, onMounted, computed, nextTick } from "vue";
import {
  IThread,
  IAssistant,
  IMessageData,
  IThreadAndMessageAndAssistant,
  IAssistantWithStatus,
} from "@/utils/types";
import { listThreads, deleteThread, getThread } from "@/api/thread";
import { ElMessage, ElMessageBox } from "element-plus";
import { listAssistants } from "@/api/assistant";
import { listMessages } from "@/api/message";
import { useRouter } from "vue-router";
import BScroll from "better-scroll";
import { useI18n } from "vue-i18n";

export default defineComponent({
  name: "HomeView",
  setup() {
    const assistantList = ref<IAssistant[]>([]);
    const threadsList = ref<IThread[]>([]);
    const firstMessages = ref<string[]>([]);
    const activeAssistant = ref({} as IAssistant);
    const assistantOfThread = ref<IAssistantWithStatus[]>([]);
    const threadAndMessages = ref<IThreadAndMessageAndAssistant[]>([]);
    const assistantScroll = ref<BScroll | null>(null);
    const historyScroll = ref<BScroll | null>(null);
    const router = useRouter();
    const { t, locale } = useI18n();
    const flag = ref(true);
    const changeLanguage = () => {
      if (flag.value) {
        locale.value = "zh";
        localStorage.setItem("lang", "zh");
        flag.value = false;
      } else {
        locale.value = "en";
        flag.value = true;
        localStorage.setItem("lang", "en");
      }
    };
    // Initialize data
    const initData = async () => {
      try {
        threadsList.value = [];
        firstMessages.value = [];
        assistantOfThread.value = [];
        const assistantsRes = await listAssistants();
        if (assistantsRes && assistantsRes.length > 0) {
          assistantList.value = assistantsRes;
          activeAssistant.value = assistantsRes[0];
        }
        const threadsRes = await listThreads(100);
        if (threadsRes) {
          threadAndMessages.value = threadsRes;
          for (let t of threadsRes) {
            if (t.thread && !t.thread.metadata?.hidden) {
              threadsList.value.push(t.thread);
              if (t.first_message && t.first_message.content && t.first_message.content.length > 0) {
                firstMessages.value.push(t.first_message.content[0].text.value);
              } else {
                firstMessages.value.push("no message yet");
              }
              assistantOfThread.value.push(t.assistant || ({} as IAssistantWithStatus));
            }
          }
        }
        assistantScroll.value = new BScroll(".assistant-list", {
          click: true,
          mouseWheel: true,
          scrollbar: {
            fade: true,
            interactive: true,
          },
        });
        historyScroll.value = new BScroll(".history-box", {
          click: true,
          mouseWheel: true,
          scrollbar: {
            fade: true,
            interactive: true,
          },
        });
      } catch (err) {
        console.error("Failed to initialize data:", err);
      }
    };
    const navigateToIconHub = () => {
      window.open("https://github.com/kvcache-ai/Lexllama");
    };
    const isEmptyObject = (obj: object): boolean => {
      // Determine whether the object is empty
      return Object.keys(obj).length === 0;
    };
    // Route navigation
    const navigateToExplore = () => {
      router.push("/explore");
    };
    const navigatorToChat = () => {
      router.push("/chat");
    };
    // Partition threads by creation date (timestamps are in seconds; 86400 s = one day)
    const todayThreads = computed(() => {
      const today = Math.floor(Date.now() / 1000);
      return threadsList.value.filter((thread) => {
        return today - thread.created_at <= 86400;
      });
    });
    const previousThreads = computed(() => {
      const today = Math.floor(Date.now() / 1000);
      return threadsList.value.filter((thread) => {
        return today - thread.created_at > 86400;
      });
    });
    onMounted(async () => {
      initData();
    });
    return {
      t,
      flag,
      assistantList,
      isEmptyObject,
      activeAssistant,
      navigateToExplore,
      navigatorToChat,
      threadsList,
      firstMessages,
      navigateToIconHub,
      assistantScroll,
      historyScroll,
      assistantOfThread,
      changeLanguage,
      initData,
      todayThreads,
      previousThreads,
    };
  },
  data() {
    return {
      projectName: "KTransformers",
      projectVersion: "v0.01",
      activeThreadIndex: -1,
      chatInit: true,
      activeThread: {} as IThread,
      allMessageInCurrentThread: [] as IMessageData[],
      inputDisabled: false,
      isSettingActiveThread: false,
      isDeletingThread: false,
      threadAndMessages: <IThreadAndMessageAndAssistant[]>[],
    };
  },
  methods: {
    setActiveAssistant(assistant: IAssistant) {
      this.chatInit = true;
      this.inputDisabled = false;
      this.activeThreadIndex = -1;
      this.activeAssistant = assistant;
      this.activeThread = {} as IThread;
      this.allMessageInCurrentThread = [];
      if (this.$route.path != "/chat") {
        this.navigatorToChat();
      }
    },
    async setActiveThreadIndex(index: number) {
      // If an active thread is already being set, return directly
      if (this.isSettingActiveThread) {
        return;
      }
      this.isSettingActiveThread = true;
      this.activeThreadIndex = index;
      this.chatInit = false;
      this.inputDisabled = false;
      this.activeAssistant = {} as IAssistant;
      this.activeThread = this.threadsList[index];
      // If the assistant of the current thread is an empty object
      if (this.isEmptyObject(this.assistantOfThread[index])) {
        ElMessage({
          message: this.t("home.withoutAssistantTip"),
          type: "warning",
        });
        this.inputDisabled = true;
      }
      try {
        // Fetch the message list of the current thread
        const res = await listMessages(this.activeThread.id, 100, "asc");
        // Convert the fetched messages to the expected format for the current thread
        this.allMessageInCurrentThread = res.map((m) => ({
          role: m.role,
          content: m.content,
          assistant_id: m.assistant_id,
          created_at: m.created_at,
        }));
      } catch (err) {
        console.log(err);
      } finally {
        this.isSettingActiveThread = false;
      }
      if (this.$route.path != "/chat") {
        this.navigatorToChat();
      }
    },
    async delThread(index: number) {
      // If a deletion is already in progress, return directly
      if (this.isDeletingThread) {
        return;
      }
      this.isDeletingThread = true;
      try {
        // Ask the user to confirm the deletion
        await ElMessageBox.confirm(this.t("home.deleteThreadTip"), "Warning", {
          confirmButtonText: "OK",
          cancelButtonText: "Cancel",
          type: "warning",
        });
        const res = await deleteThread(this.threadsList[index].id);
        this.threadsList.splice(index, 1);
        this.firstMessages.splice(index, 1);
        this.assistantOfThread.splice(index, 1);
        // Jump to the first assistant or another suitable page
        this.setActiveAssistant(this.assistantList[0]);
        ElMessage({
          type: "success",
          message: "Delete completed",
        });
      } catch (err) {
        // Log the failure and display an error message to the user
        console.error("Delete session failed:", err);
        ElMessage({
          type: "error",
          message: `Delete failed`,
        });
      } finally {
        this.isDeletingThread = false; // Ensure the deleting flag is reset no matter what
      }
    },
    // Handles the update of the assistant asynchronously.
    async handleUpdateAssistant(value: any) {
      await this.initData();
      if (this.activeThreadIndex != -1) {
        this.setActiveThreadIndex(this.activeThreadIndex);
      } else if (this.activeAssistant.id) {
        this.setActiveThreadIndex(0);
      } else {
        this.setActiveAssistant(this.assistantList[0]);
      }
    },
  },
});
</script>

<style lang="stylus" rel="stylesheet/stylus" scoped>
@import '../assets/css/mixins.styl';
.home {
width: 100%;
height: 100%;
position: relative;
}
.left-panel {
width: 320px;
height: 100%;
background-color: #363433;
padding: 30px 30px;
.logo-box {
.logo {
.img {
width: 36px;
height: 36px;
}
.text {
font-size: 28px;
font-weight: bold;
margin-left: 10px;
color: #edf2ea;
}
}
.version {
text-align: right;
font-size: 14px;
color: #bdbdbd;
}
}
.divider {
border-bottom: 1px solid #D7D7D7;
width: 30%;
margin: 30px auto;
}
.lang-box {
position: relative;
width: 100%;
height: 30px;
margin: auto;
margin-bottom: 10px;
.el-dropdown {
font-size: 14px;
position: absolute;
top: 50%;
left: 50%;
transform: translate(-50%, -50%);
}
}
.assistant-box {
.assistant-list {
min-height: 50px;
max-height: 300px;
overflow: hidden;
position: relative;
ul > li.assistant-item {
padding: 8px 15px;
color: #edf2ea;
img {
width: 32px;
height: 32px;
}
.name {
margin-left: 12px;
font-size: 14px;
color: #edf2ea;
}
i.iconfont {
display: none;
margin-left: 10px;
}
&:hover {
background-color: $bg_gray_light_hover;
cursor: pointer;
border-radius: 4px;
.name {
color: #313433;
}
i.iconfont {
display: block;
}
}
}
}
.explore {
position: relative;
justify-content: center;
display: flex;
margin-top: 10px;
.explore-btn {
margin: 0 auto;
padding: 0 20px;
justify-content: center;
height: 32px;
line-height: 32px;
background-color: #FFFFFF;
border: 1px solid RGBA(0, 0, 0, 0.15);
border-radius: 16px;
i {
color: #8080FF;
}
.text {
color: #7F7F7F;
margin-left: 4px;
}
&:hover {
background-color: #FAFAFA;
cursor: pointer;
}
}
}
}
.history-box {
position: relative;
.date {
font-size: 14px;
color: #7F7F7F;
margin: 8px 0;
&:first-child {
margin-top: 0;
}
}
li.chat-item {
padding: 12px 15px;
cursor: pointer;
background-color: #edf2ea;
border-radius: 4px;
margin-bottom: 10px;
font-size: 16px;
.chat-abbr {
font-size: 14px;
color: #313433;
white-space: nowrap;
overflow: hidden;
text-overflow: ellipsis;
}
.chat-ops {
display: flex;
margin-top: 5px;
img {
width: 16px;
height: 16px;
}
.name {
font-size: 12px;
color: #898989;
margin-left: 8px;
}
i.iconfont {
color: $gray_60;
}
}
&:hover, &.active {
transition: 0.3s all;
cursor: pointer;
background-color: #a2a79f;
.chat-abbr {
color: black;
}
.name, i.iconfont {
color: black;
}
}
}
}
.icon-box {
width: 100%;
display: flex;
flex-direction: row;
justify-content: flex-end;
align-items: center;
.iconhub {
width: 32px;
height: 24px;
background: white;
font-size: 30px;
border: none;
overflow: hidden;
border-radius: 15%;
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
color: #898989;
transition: all 0.5s;
cursor: pointer;
}
.iconhub:hover {
background: #e5e5e5;
text-decoration: none;
}
.iconlanguage {
margin-left: 15px;
width: 32px;
height: 24px;
background: white;
font-size: 30px;
border: none;
overflow: hidden;
border-radius: 15%;
display: flex;
flex-direction: column;
justify-content: center;
align-items: center;
color: #898989;
transition: all 0.5s;
cursor: pointer;
}
.iconlanguage:hover {
background: #e5e5e5;
text-decoration: none;
}
}
}
ul {
list-style: none;
}
.example-2 {
display: flex;
justify-content: center;
align-items: center;
}
.example-2 .icon-content {
margin: 0 10px;
position: relative;
}
.example-2 .icon-content .tooltip {
position: absolute;
top: -30px;
left: 50%;
transform: translateX(-50%);
color: #fff;
padding: 6px 10px;
border-radius: 5px;
opacity: 0;
visibility: hidden;
font-size: 14px;
transition: all 0.3s ease;
}
.example-2 .icon-content:hover .tooltip {
opacity: 1;
visibility: visible;
top: -50px;
}
.main-panel {
height: 100%;
background-color: #f1f0ed;
}
</style>
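
The template above looks up $t("home.today"), $t("home.previous"), and the home.withoutAssistantTip / home.deleteThreadTip keys. The locale files are not part of this commit, so the following en-locale sketch only shows the keys home.vue assumes; the English strings are placeholders:

    // sketch of an assumed locale module (e.g. behind the '@/locals' shim declared earlier)
    export default {
      home: {
        today: 'Today',
        previous: 'Previous',
        withoutAssistantTip: 'This conversation has no assistant attached.',
        deleteThreadTip: 'Are you sure you want to delete this conversation?',
      },
    }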
ktransformers/website/tests/unit/example.spec.ts  (new file, mode 100644)

import { shallowMount } from '@vue/test-utils'
import HelloWorld from '@/components/HelloWorld.vue'

describe('HelloWorld.vue', () => {
  it('renders props.msg when passed', () => {
    const msg = 'new message'
    const wrapper = shallowMount(HelloWorld, {
      props: { msg }
    })
    expect(wrapper.text()).toMatch(msg)
  })
})
ktransformers/website/tsconfig.json  (new file, mode 100644)

{
  "compilerOptions": {
    "target": "es5",
    "module": "esnext",
    "strict": true,
    "jsx": "preserve",
    "importHelpers": true,
    "moduleResolution": "node",
    "skipLibCheck": true,
    "esModuleInterop": true,
    "allowSyntheticDefaultImports": true,
    "forceConsistentCasingInFileNames": true,
    "useDefineForClassFields": true,
    "sourceMap": true,
    "allowJs": true,
    "baseUrl": ".",
    "types": [
      "webpack-env",
      "jest"
    ],
    "paths": {
      "@/*": [
        "src/*"
      ]
    },
    "lib": [
      "esnext",
      "dom",
      "dom.iterable",
      "scripthost"
    ]
  },
  "include": [
    "src/**/*.ts",
    "src/**/*.tsx",
    "src/**/*.vue",
    "tests/**/*.ts",
    "tests/**/*.tsx",
    "config.d.ts"
  ],
  "exclude": [
    "node_modules"
  ]
}
ktransformers/website/vue.config.js  (new file, mode 100644)

module.exports = {
  // Configure webpack-dev-server behavior.
  devServer: {
    open: false,        // whether to open the browser after compiling
    host: '0.0.0.0',    // host
    port: 8082,         // port
    https: false,       // whether to serve over https
    proxy: {
      '/api': {
        target: 'http://localhost:9016/v1',  // your backend server address
        changeOrigin: true,                  // whether to allow cross-origin requests
        pathRewrite: {
          '/api': ''  // replace the '/api' prefix with an empty string if your backend does not need it
        }
      }
    }
  },
  publicPath: '/web/',       // base path
  outputDir: 'dist',         // output directory for builds
  assetsDir: 'static',       // directory for static assets
  indexPath: 'index.html',   // output path of the html file
  filenameHashing: true,     // hash file names
  lintOnSave: false,         // whether to check with `eslint-loader` on save
  // How are components rendered onto the page? (ast: abstract syntax tree; vDom: virtual DOM)
  // template ---> ast ---> render ---> vDom ---> real DOM ---> page
  // runtime-only: templates are compiled into render functions at build time
  // runtime-compiler: templates are compiled at runtime
  runtimeCompiler: false,
  transpileDependencies: [],  // babel-loader skips node_modules dependencies by default
  productionSourceMap: false, // whether to generate source maps for production builds
  // Tweak the internal webpack configuration
  configureWebpack: () => {},
  chainWebpack: () => {},
}
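
To make the proxy concrete: with the devServer block above, a development-time request such as

    fetch('/api/assistants')  // illustrative endpoint; the API layer is not in this commit

is forwarded to http://localhost:9016/v1/assistants: pathRewrite strips the /api prefix, the remainder is appended to the target's /v1 base, and changeOrigin rewrites the Host header to match the target.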
pyproject.toml  (new file, mode 100644)

[build-system]
requires = [
    "setuptools",
    "torch == 2.3.1",
    "ninja",
    "packaging"
]
build-backend = "setuptools.build_meta"
requirements-local_chat.txt  (new file, mode 100644)

fire
transformers
numpy
torch>=2.3.0
packaging
setup.py  (new file, mode 100644)

#!/usr/bin/env python
# coding=utf-8
'''
Description  :
Author       : chenxl
Date         : 2024-07-12 07:25:42
Version      : 1.0.0
LastEditors  : chenxl
LastEditTime : 2024-07-27 04:31:03
'''
import os
import shutil
import sys
import re
import ast
import subprocess
import platform
import io
from pathlib import Path
from packaging.version import parse
import torch.version
from wheel.bdist_wheel import bdist_wheel as _bdist_wheel
from setuptools import setup, Extension
import torch
from torch.utils.cpp_extension import BuildExtension, CUDAExtension, CUDA_HOME

ROOT_DIR = os.path.dirname(__file__)

class VersionInfo:
    THIS_DIR = os.path.dirname(os.path.abspath(__file__))
    PACKAGE_NAME = "ktransformers"

    def get_cuda_bare_metal_version(self, cuda_dir):
        raw_output = subprocess.check_output([cuda_dir + "/bin/nvcc", "-V"], universal_newlines=True)
        output = raw_output.split()
        release_idx = output.index("release") + 1
        bare_metal_version = parse(output[release_idx].split(",")[0])
        cuda_version = f"{bare_metal_version.major}{bare_metal_version.minor}"
        return cuda_version

    def get_cuda_version_of_torch(self,):
        torch_cuda_version = parse(torch.version.cuda)
        cuda_version = f"{torch_cuda_version.major}{torch_cuda_version.minor}"
        return cuda_version

    def get_platform(self,):
        """
        Returns the platform name as used in wheel filenames.
        """
        if sys.platform.startswith("linux"):
            return f'linux_{platform.uname().machine}'
        else:
            raise ValueError("Unsupported platform: {}".format(sys.platform))

    def get_cpu_instruct(self,):
        if sys.platform.startswith("linux"):
            with open('/proc/cpuinfo', 'r') as cpu_f:
                cpuinfo = cpu_f.read()
            flags_line = [line for line in cpuinfo.split('\n') if line.startswith('flags')][0]
            flags = flags_line.split(':')[1].strip().split(' ')
            for flag in flags:
                if 'avx512' in flag:
                    return 'avx512'
            for flag in flags:
                if 'avx2' in flag:
                    return 'avx2'
            raise ValueError("Unsupported cpu Instructions: {}".format(flags_line))

    def get_torch_version(self,):
        torch_version_raw = parse(torch.__version__)
        torch_version = f"{torch_version_raw.major}{torch_version_raw.minor}"
        return torch_version

    def get_package_version(self,):
        version_file = os.path.join(Path(VersionInfo.THIS_DIR), VersionInfo.PACKAGE_NAME, "__init__.py")
        with open(version_file, "r", encoding="utf-8") as f:
            version_match = re.search(r"^__version__\s*=\s*(.*)$", f.read(), re.MULTILINE)
        public_version = ast.literal_eval(version_match.group(1))
        package_version = f"{str(public_version)}+cu{self.get_cuda_bare_metal_version(CUDA_HOME)}torch{self.get_torch_version()}{self.get_cpu_instruct()}"
        return package_version


class BuildWheelsCommand(_bdist_wheel):
    def get_wheel_name(self,):
        version_info = VersionInfo()
        python_version = f"cp{sys.version_info.major}{sys.version_info.minor}"
        wheel_filename = f"{VersionInfo.PACKAGE_NAME}-{version_info.get_package_version()}-{python_version}-{python_version}-{version_info.get_platform()}.whl"
        return wheel_filename

    def run(self):
        super().run()
        impl_tag, abi_tag, plat_tag = self.get_tag()
        archive_basename = f"{self.wheel_dist_name}-{impl_tag}-{abi_tag}-{plat_tag}"
        wheel_path = os.path.join(self.dist_dir, archive_basename + ".whl")
        wheel_name_with_platform = os.path.join(self.dist_dir, self.get_wheel_name())
        os.rename(wheel_path, wheel_name_with_platform)


# Convert distutils Windows platform specifiers to CMake -A arguments
PLAT_TO_CMAKE = {
    "win32": "Win32",
    "win-amd64": "x64",
    "win-arm32": "ARM",
    "win-arm64": "ARM64",
}


class CopyExtension(Extension):
    def __init__(self, name: str, sourcedir: str = "", copy_file_source="") -> None:
        super().__init__(name, sources=[])
        self.sourcedir = os.fspath(Path(sourcedir).resolve())
        self.source_file = copy_file_source


class CMakeExtension(Extension):
    def __init__(self, name: str, sourcedir: str = "") -> None:
        super().__init__(name, sources=[])
        self.sourcedir = os.fspath(Path(sourcedir).resolve() / "ktransformers/ktransformers_ext")


class CMakeBuild(BuildExtension):
    def build_extension(self, ext) -> None:
        if isinstance(ext, CopyExtension):
            ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
            extdir = ext_fullpath.parent.resolve()
            shutil.copy(ext.source_file, extdir)
            return
        if not isinstance(ext, CMakeExtension):
            super().build_extension(ext)
            return
        ext_fullpath = Path.cwd() / self.get_ext_fullpath(ext.name)
        extdir = ext_fullpath.parent.resolve()

        # Using this requires trailing slash for auto-detection & inclusion of
        # auxiliary "native" libs
        debug = int(os.environ.get("DEBUG", 0)) if self.debug is None else self.debug
        cfg = "Debug" if debug else "Release"

        # CMake lets you override the generator - we need to check this.
        # Can be set with Conda-Build, for example.
        cmake_generator = os.environ.get("CMAKE_GENERATOR", "")

        # Set Python_EXECUTABLE instead if you use PYBIND11_FINDPYTHON
        # EXAMPLE_VERSION_INFO shows you how to pass a value into the C++ code
        # from Python.
        cmake_args = [
            f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY={extdir}{os.sep}",
            f"-DPYTHON_EXECUTABLE={sys.executable}",
            f"-DCMAKE_BUILD_TYPE={cfg}",  # not used on MSVC, but no harm
        ]
        build_args = []
        if "CMAKE_ARGS" in os.environ:
            cmake_args += [item for item in os.environ["CMAKE_ARGS"].split(" ") if item]

        # In this example, we pass in the version to C++. You might not need to.
        cmake_args += [f"-DEXAMPLE_VERSION_INFO={self.distribution.get_version()}"]

        if self.compiler.compiler_type != "msvc":
            if not cmake_generator or cmake_generator == "Ninja":
                try:
                    import ninja

                    ninja_executable_path = Path(ninja.BIN_DIR) / "ninja"
                    cmake_args += [
                        "-GNinja",
                        f"-DCMAKE_MAKE_PROGRAM:FILEPATH={ninja_executable_path}",
                    ]
                except ImportError:
                    pass
        else:
            # Single config generators are handled "normally"
            single_config = any(x in cmake_generator for x in {"NMake", "Ninja"})

            # CMake allows an arch-in-generator style for backward compatibility
            contains_arch = any(x in cmake_generator for x in {"ARM", "Win64"})
            if not single_config and not contains_arch:
                cmake_args += ["-A", PLAT_TO_CMAKE[self.plat_name]]

            # Multi-config generators have a different way to specify configs
            if not single_config:
                cmake_args += [f"-DCMAKE_LIBRARY_OUTPUT_DIRECTORY_{cfg.upper()}={extdir}"]
                build_args += ["--config", cfg]

        if sys.platform.startswith("darwin"):
            # Cross-compile support for macOS - respect ARCHFLAGS if set
            archs = re.findall(r"-arch (\S+)", os.environ.get("ARCHFLAGS", ""))
            if archs:
                cmake_args += ["-DCMAKE_OSX_ARCHITECTURES={}".format(";".join(archs))]

        if "CMAKE_BUILD_PARALLEL_LEVEL" not in os.environ:
            if hasattr(self, "parallel") and self.parallel:
                build_args += [f"-j{self.parallel}"]

        build_temp = Path(ext.sourcedir) / "build"
        if not build_temp.exists():
            build_temp.mkdir(parents=True)

        subprocess.run(["cmake", ext.sourcedir, *cmake_args], cwd=build_temp, check=True)
        subprocess.run(["cmake", "--build", ".", *build_args], cwd=build_temp, check=True)


def read_readme() -> str:
    p = os.path.join(ROOT_DIR, "README.md")
    if os.path.isfile(p):
        return io.open(p, "r", encoding="utf-8").read()
    else:
        return ""


setup(
    name="ktransformers",
    version=VersionInfo().get_package_version(),
    author="KVCache.ai",
    license="Apache 2.0",
    description="KTransformers, pronounced as Quick Transformers, is designed to enhance your Transformers experience with advanced kernel optimizations and placement/parallelism strategies.",
    long_description=read_readme(),
    long_description_content_type="text/markdown",
    cmdclass={"build_ext": CMakeBuild},
    install_requires=[
        "torch >= 2.3.0",
        "transformers == 4.43.2",
        "fastapi >= 0.111.0",
        "langchain >= 0.2.0",
        "blessed >= 1.20.0",
        "accelerate >= 0.31.0",
        "sentencepiece >= 0.1.97",
        "setuptools",
        "ninja",
        "wheel",
        "colorlog",
        "build",
        "packaging",
        "fire"
    ],
    python_requires=">=3.10",
    entry_points={
        "console_scripts": [
            "ktransformers=ktransformers.server.main:main",
        ],
    },
    packages=["ktransformers"],
    include_package_data=True,
    ext_modules=[
        CUDAExtension('KTransformersOps', [
            'ktransformers/ktransformers_ext/cuda/custom_gguf/dequant.cu',
            'ktransformers/ktransformers_ext/cuda/binding.cpp',
            'ktransformers/ktransformers_ext/cuda/gptq_marlin/gptq_marlin.cu',
        ]),
        CMakeExtension("cpuinfer_ext")]
)
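
For concreteness, a worked example of the version and wheel-name composition above (all inputs hypothetical; ktransformers/__init__.py is not part of this commit): with __version__ = "0.1.0", nvcc reporting release 12.1, torch 2.3, and an AVX512-capable CPU, get_package_version() returns 0.1.0+cu121torch23avx512, and under CPython 3.11 on x86-64 Linux get_wheel_name() yields ktransformers-0.1.0+cu121torch23avx512-cp311-cp311-linux_x86_64.whl.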
third_party/llama.cpp  (new submodule @ a94e6ff8)

    Subproject commit a94e6ff8774b7c9f950d9545baf0ce35e8d1ed2f
third_party/llamafile/README.md  (new file, mode 100644)

The code in this folder is copied from [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile). Special thanks to the Mozilla-Ocho team.
third_party/llamafile/bench.h  (new file, mode 100644)
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/bench.h
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp ts=4 sts=4 sw=4 fenc=utf-8 :vi
#pragma once
#include <stdio.h>
#include "micros.h"
#define BENCH(x) \
do { \
x; \
__asm__ volatile("" ::: "memory"); \
long long start = micros(); \
for (int i = 0; i < ITERATIONS; ++i) { \
__asm__ volatile("" ::: "memory"); \
x; \
__asm__ volatile("" ::: "memory"); \
} \
printf("%9lld us %s\n", (micros() - start + ITERATIONS - 1) / ITERATIONS, #x); \
} while (0)
third_party/llamafile/flags.cpp  (new file, mode 100644)

// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#include "flags.h"

bool FLAG_precise = false;
third_party/llamafile/flags.h  (new file, mode 100644)

// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/flags.cpp
// Copyright 2024 Mozilla Foundation.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.

#pragma once

extern bool FLAG_precise;
third_party/llamafile/iqk_mul_mat.inc  (new file, mode 100644)

// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat.inc
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
// -*- mode:c++;indent-tabs-mode:nil;c-basic-offset:4;coding:utf-8 -*-
// vi: set et ft=cpp fenc=utf-8 :vi
//
// Copyright 2024 Iwan Kawrakow
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <cstring>
#include <type_traits>
#if defined __x86_64__ || defined __aarch64__
#include "llama.cpp/ggml-impl.h"
#include "llama.cpp/ggml-quants.h"
#include "sgemm.h"
// For i-quants, I had to explicitly specify which
// functions to inline / not inline (at least for some
// of the functions), else performance would be significantly
// lower. This is worrisome as things can change with,
// e.g., a different compiler version or running on a different
// CPU.
#ifdef _MSC_VER
#define IQK_NOINLINE __declspec(noinline)
#define IQK_ALWAYS_INLINE inline
#else
#define IQK_NOINLINE __attribute__((__noinline__))
#define IQK_ALWAYS_INLINE __attribute__((always_inline))
#endif
#define GGML_COMMON_IMPL_C
#include "llama.cpp/ggml-common.h"
// clang-format off
// This matrix - vector and matrix - matrix multiplication implementation
// for legacy quants, k-quants and i-quants makes prompt processing 150-200%
// (legacy and k-quants) or 250-400% (i-quants) faster
// compared to mainline llama.cpp (and llamafile).
// It provides implementations for ARM_NEON (all quants) and AVX2
// (all quants except sub-4 bit i-quants).
//
// Main idea is that unpacking the quants and the block scales to
// be ready for dot products with the corresponding Q8_Y quants
// takes time (here 'Y' stands for K, 0, or 1, depending on quantization type).
// Hence, if we are performing a QX x Q8_Y matrix matrix
// multiplication (as needed for prompt processing), we can get
// a significant speedup by reusing the unpacked QX quants and scales
// for multiplication with several Q8_K columns. We also achieve fewer
// loads from memory, which is the main purpose of tiling in general
// purpose matrix multiplication packages.
#include <utility>
#include <array>
#endif
namespace
{
typedef
struct
{
int32_t
i1
;
int32_t
i2
;
}
mmid_row_mapping
;
struct
DataInfo
{
float
*
s
;
const
char
*
cy
;
size_t
bs
;
size_t
by
;
int
cur_y
=
0
;
int
ne11
;
const
mmid_row_mapping
*
row_mapping
=
nullptr
;
size_t
bs2
=
0
;
inline
const
char
*
src1_row
(
int
iy
)
const
{
if
(
!
row_mapping
)
return
cy
+
(
cur_y
+
iy
)
*
by
;
int
i11
=
row_mapping
[
cur_y
+
iy
]
.
i1
%
ne11
;
int
i12
=
row_mapping
[
cur_y
+
iy
]
.
i2
;
return
cy
+
(
i11
+
i12
*
ne11
)
*
by
;
}
inline
void
store
(
int
ix
,
int
iy
,
float
result
)
const
{
*
(
dst_row
(
iy
)
+
ix
)
=
result
;
//dst_row(iy)[ix] = result;
}
inline
float
*
dst_row
(
int
iy
)
const
{
if
(
!
row_mapping
)
return
s
+
(
cur_y
+
iy
)
*
bs
;
int
i12
=
row_mapping
[
cur_y
+
iy
]
.
i2
;
int
i1
=
row_mapping
[
cur_y
+
iy
]
.
i1
;
int
i2
=
i12
;
return
s
+
i1
*
bs
+
i2
*
bs2
;
}
};
typedef
void
(
*
mul_mat_t
)(
int
n
,
const
void
*
vx
,
size_t
bx
,
const
DataInfo
&
info
,
int
nrc_x
);
struct
MulMat
{
std
::
array
<
mul_mat_t
,
8
>
funcs
=
{};
//inline void mul_mat_NxM(int n, const void * vx, size_t bx, DataInfo& info, int nrc_x, int nrc_y) {
IQK_NOINLINE
void
mul_mat_NxM
(
int
n
,
const
void
*
vx
,
size_t
bx
,
DataInfo
&
info
,
int
nrc_x
,
int
nrc_y
)
{
constexpr
int
k_x_step
=
64
;
// This works best on my Ryzen-7950X and M2 Max CPUs (but differences to other tile size are small)
int
n_step
=
(
nrc_y
-
info
.
cur_y
)
/
funcs
.
size
();
if
(
n_step
>
0
)
{
for
(
int
ix
=
0
;
ix
<
nrc_x
;
ix
+=
k_x_step
)
{
auto
this_info
=
info
;
this_info
.
s
+=
ix
;
int
this_nrc_x
=
ix
+
k_x_step
<=
nrc_x
?
k_x_step
:
nrc_x
-
ix
;
for
(
int
iy
=
0
;
iy
<
n_step
;
++
iy
)
{
funcs
.
back
()(
n
,
(
const
void
*
)((
const
char
*
)
vx
+
ix
*
bx
),
bx
,
this_info
,
this_nrc_x
);
this_info
.
cur_y
+=
funcs
.
size
();
}
}
info
.
cur_y
+=
funcs
.
size
()
*
n_step
;
}
int
n_left
=
nrc_y
-
info
.
cur_y
;
if
(
n_left
>
0
)
{
funcs
[
n_left
-
1
](
n
,
vx
,
bx
,
info
,
nrc_x
);
}
}
static
IQK_NOINLINE
bool
set_mul_mat
(
int
typeA
,
int
ne00
,
MulMat
&
mm
,
int
&
row_size_q8
,
int
Ny
);
private
:
template
<
typename
Dequantizer
>
static
IQK_NOINLINE
void
set_functions
(
MulMat
&
m
);
};
inline
void
make_q4_scales
(
const
uint8_t
*
scales8
,
uint32_t
*
aux32
)
{
const
uint16_t
*
scales
=
(
const
uint16_t
*
)
scales8
;
const
uint32_t
a0
=
scales
[
0
]
|
(
scales
[
1
]
<<
16
);
const
uint32_t
a1
=
scales
[
2
]
|
(
scales
[
3
]
<<
16
);
const
uint32_t
a2
=
scales
[
4
]
|
(
scales
[
5
]
<<
16
);
aux32
[
3
]
=
((
a2
>>
4
)
&
0x0f0f0f0f
)
|
((
a1
>>
2
)
&
0x30303030
);
aux32
[
1
]
=
((
a2
>>
0
)
&
0x0f0f0f0f
)
|
((
a0
>>
2
)
&
0x30303030
);
aux32
[
2
]
=
a1
&
0x3f3f3f3f
;
aux32
[
0
]
=
a0
&
0x3f3f3f3f
;
}
const
uint64_t
keven_signs
[
128
]
=
{
0x0101010101010101
,
0xff010101010101ff
,
0xff0101010101ff01
,
0x010101010101ffff
,
0xff01010101ff0101
,
0x0101010101ff01ff
,
0x0101010101ffff01
,
0xff01010101ffffff
,
0xff010101ff010101
,
0x01010101ff0101ff
,
0x01010101ff01ff01
,
0xff010101ff01ffff
,
0x01010101ffff0101
,
0xff010101ffff01ff
,
0xff010101ffffff01
,
0x01010101ffffffff
,
0xff0101ff01010101
,
0x010101ff010101ff
,
0x010101ff0101ff01
,
0xff0101ff0101ffff
,
0x010101ff01ff0101
,
0xff0101ff01ff01ff
,
0xff0101ff01ffff01
,
0x010101ff01ffffff
,
0x010101ffff010101
,
0xff0101ffff0101ff
,
0xff0101ffff01ff01
,
0x010101ffff01ffff
,
0xff0101ffffff0101
,
0x010101ffffff01ff
,
0x010101ffffffff01
,
0xff0101ffffffffff
,
0xff01ff0101010101
,
0x0101ff01010101ff
,
0x0101ff010101ff01
,
0xff01ff010101ffff
,
0x0101ff0101ff0101
,
0xff01ff0101ff01ff
,
0xff01ff0101ffff01
,
0x0101ff0101ffffff
,
0x0101ff01ff010101
,
0xff01ff01ff0101ff
,
0xff01ff01ff01ff01
,
0x0101ff01ff01ffff
,
0xff01ff01ffff0101
,
0x0101ff01ffff01ff
,
0x0101ff01ffffff01
,
0xff01ff01ffffffff
,
0x0101ffff01010101
,
0xff01ffff010101ff
,
0xff01ffff0101ff01
,
0x0101ffff0101ffff
,
0xff01ffff01ff0101
,
0x0101ffff01ff01ff
,
0x0101ffff01ffff01
,
0xff01ffff01ffffff
,
0xff01ffffff010101
,
0x0101ffffff0101ff
,
0x0101ffffff01ff01
,
0xff01ffffff01ffff
,
0x0101ffffffff0101
,
0xff01ffffffff01ff
,
0xff01ffffffffff01
,
0x0101ffffffffffff
,
0xffff010101010101
,
0x01ff0101010101ff
,
0x01ff01010101ff01
,
0xffff01010101ffff
,
0x01ff010101ff0101
,
0xffff010101ff01ff
,
0xffff010101ffff01
,
0x01ff010101ffffff
,
0x01ff0101ff010101
,
0xffff0101ff0101ff
,
0xffff0101ff01ff01
,
0x01ff0101ff01ffff
,
0xffff0101ffff0101
,
0x01ff0101ffff01ff
,
0x01ff0101ffffff01
,
0xffff0101ffffffff
,
0x01ff01ff01010101
,
0xffff01ff010101ff
,
0xffff01ff0101ff01
,
0x01ff01ff0101ffff
,
0xffff01ff01ff0101
,
0x01ff01ff01ff01ff
,
0x01ff01ff01ffff01
,
0xffff01ff01ffffff
,
0xffff01ffff010101
,
0x01ff01ffff0101ff
,
0x01ff01ffff01ff01
,
0xffff01ffff01ffff
,
0x01ff01ffffff0101
,
0xffff01ffffff01ff
,
0xffff01ffffffff01
,
0x01ff01ffffffffff
,
0x01ffff0101010101
,
0xffffff01010101ff
,
0xffffff010101ff01
,
0x01ffff010101ffff
,
0xffffff0101ff0101
,
0x01ffff0101ff01ff
,
0x01ffff0101ffff01
,
0xffffff0101ffffff
,
0xffffff01ff010101
,
0x01ffff01ff0101ff
,
0x01ffff01ff01ff01
,
0xffffff01ff01ffff
,
0x01ffff01ffff0101
,
0xffffff01ffff01ff
,
0xffffff01ffffff01
,
0x01ffff01ffffffff
,
0xffffffff01010101
,
0x01ffffff010101ff
,
0x01ffffff0101ff01
,
0xffffffff0101ffff
,
0x01ffffff01ff0101
,
0xffffffff01ff01ff
,
0xffffffff01ffff01
,
0x01ffffff01ffffff
,
0x01ffffffff010101
,
0xffffffffff0101ff
,
0xffffffffff01ff01
,
0x01ffffffff01ffff
,
0xffffffffffff0101
,
0x01ffffffffff01ff
,
0x01ffffffffffff01
,
0xffffffffffffffff
,
};
}
bool
iqk_mul_mat
(
long
Nx
,
long
Ny
,
long
ne00
,
int
typeA
,
const
void
*
A
,
const
void
*
B
,
float
*
C
,
long
stride_C
,
int
ith
,
int
nth
)
{
MulMat
mm
;
int
row_size_q8
;
if
(
!
MulMat
::
set_mul_mat
(
typeA
,
ne00
,
mm
,
row_size_q8
,
Ny
))
{
return
false
;
}
auto
row_size_qx
=
ggml_row_size
((
ggml_type
)
typeA
,
ne00
);
auto
nrc_x
=
(
Nx
+
nth
-
1
)
/
nth
;
auto
first_x
=
ith
*
nrc_x
;
if
(
first_x
+
nrc_x
>
Nx
)
nrc_x
=
Nx
-
first_x
;
DataInfo
info
{
C
+
first_x
,
(
const
char
*
)
B
,
(
size_t
)
stride_C
,
(
size_t
)
row_size_q8
,
0
,
1
,
nullptr
,
0
};
mm
.
mul_mat_NxM
(
ne00
,
(
const
char
*
)
A
+
row_size_qx
*
first_x
,
row_size_qx
,
info
,
nrc_x
,
Ny
);
return
true
;
}
bool
iqk_mul_mat_moe
(
long
Nx
,
long
Ny
,
long
ne00
,
int
ne11
,
int
typeA
,
const
void
*
A
,
const
void
*
B
,
float
*
C
,
long
nb1
,
long
nb2
,
const
void
*
vrow_mapping
,
int
ith
,
int
nth
)
{
const
mmid_row_mapping
*
row_mapping
=
(
const
mmid_row_mapping
*
)
vrow_mapping
;
assert
(
row_mapping
!=
nullptr
);
MulMat
mm
;
int
row_size_q8
;
if
(
!
MulMat
::
set_mul_mat
(
typeA
,
ne00
,
mm
,
row_size_q8
,
Ny
))
{
return
false
;
}
int
row_size_qx
=
ggml_row_size
((
ggml_type
)
typeA
,
ne00
);
int
nrc_x
=
(
Nx
+
nth
-
1
)
/
nth
;
int
first_x
=
ith
*
nrc_x
;
if
(
first_x
+
nrc_x
>
Nx
)
nrc_x
=
Nx
-
first_x
;
DataInfo
info
{
C
+
first_x
,
(
const
char
*
)
B
,
nb1
/
sizeof
(
float
),
(
size_t
)
row_size_q8
,
0
,
ne11
,
row_mapping
,
nb2
/
sizeof
(
float
)};
mm
.
mul_mat_NxM
(
ne00
,
(
const
char
*
)
A
+
row_size_qx
*
first_x
,
row_size_qx
,
info
,
nrc_x
,
Ny
);
return
true
;
}
#if defined __x86_64__
#if defined HAVE_FANCY_SIMD
#undef HAVE_FANCY_SIMD
#endif
#if defined(__AVX512F__) && defined(__AVX512VNNI__) && defined(__AVX512VL__) && defined(__AVX512BW__) && defined(__AVX512DQ__)
#define HAVE_FANCY_SIMD
#endif
namespace
{
inline
float
hsum_float_4
(
__m128
x
)
{
x
=
_mm_add_ps
(
x
,
_mm_movehl_ps
(
x
,
x
));
x
=
_mm_add_ss
(
x
,
_mm_movehdup_ps
(
x
));
return
_mm_cvtss_f32
(
x
);
}
inline
float
hsum_float_8
(
__m256
x
)
{
return
hsum_float_4
(
_mm_add_ps
(
_mm256_castps256_ps128
(
x
),
_mm256_extractf128_ps
(
x
,
1
)));
}
#define MM256_SET_M128I(a, b) _mm256_insertf128_si256(_mm256_castsi128_si256(b), (a), 1)
template
<
int
nrc
,
typename
block_q8
=
block_q8_K
>
struct
Q8
{
constexpr
static
int
nrc_y
=
nrc
;
Q8
(
const
DataInfo
&
info
)
{
for
(
int
iy
=
0
;
iy
<
nrc_y
;
++
iy
)
y
[
iy
]
=
(
const
block_q8
*
)
info
.
src1_row
(
iy
);
}
#ifdef HAVE_FANCY_SIMD
inline
__m512i
load_quants
(
int
iy
,
int
i
,
int
j
)
const
{
return
_mm512_loadu_si512
((
const
__m512i
*
)
y
[
iy
][
i
]
.
qs
+
j
);
}
#else
inline
__m256i
load_quants
(
int
iy
,
int
i
,
int
j
)
const
{
return
_mm256_loadu_si256
((
const
__m256i
*
)
y
[
iy
][
i
]
.
qs
+
j
);
}
#endif
inline
__m256i
load_bsums
(
int
iy
,
int
i
)
const
{
return
_mm256_loadu_si256
((
const
__m256i
*
)
y
[
iy
][
i
]
.
bsums
);
}
inline
float
scale
(
int
iy
,
int
i
)
const
{
return
y
[
iy
][
i
]
.
d
;
}
const
block_q8
*
y
[
nrc_y
];
};
// Handles q4_K and q5_K scales/mins
struct
Scales8K
{
template
<
typename
Q8
>
inline
__m256i
process_mins_and_scales
(
const
uint8_t
*
data
,
float
c
,
int
i
,
const
Q8
&
q8
,
__m256
*
accd
)
{
make_q4_scales
(
data
,
utmp
);
const
__m256i
mins_and_scales
=
_mm256_cvtepu8_epi16
(
_mm_set_epi32
(
utmp
[
3
],
utmp
[
2
],
utmp
[
1
],
utmp
[
0
]));
const
__m128i
mins128
=
_mm256_extracti128_si256
(
mins_and_scales
,
1
);
accum_mins
(
mins128
,
q8
,
i
,
c
,
accd
);
const
__m128i
sc128
=
_mm256_extracti128_si256
(
mins_and_scales
,
0
);
return
MM256_SET_M128I
(
sc128
,
sc128
);
}
#ifdef HAVE_FANCY_SIMD
template
<
typename
Q8
>
inline
__m512i
process_mins_and_scales_64
(
const
uint8_t
*
data
,
float
c
,
int
i
,
const
Q8
&
q8
,
__m256
*
accd
)
{
auto
scales
=
process_mins_and_scales
(
data
,
c
,
i
,
q8
,
accd
);
return
_mm512_inserti32x8
(
_mm512_castsi256_si512
(
scales
),
scales
,
1
);
}
#endif
template
<
typename
Q8
>
inline
void
accum_mins
(
const
__m128i
&
mins128
,
const
Q8
&
q8
,
int
i
,
float
c
,
__m256
*
accd
)
const
{
const
__m256i
mins
=
MM256_SET_M128I
(
_mm_shuffle_epi8
(
mins128
,
shuffles
[
1
]),
_mm_shuffle_epi8
(
mins128
,
shuffles
[
0
]));
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
const
__m256i
q8s
=
q8
.
load_bsums
(
iy
,
i
);
const
__m256i
prod
=
_mm256_madd_epi16
(
mins
,
q8s
);
accd
[
iy
]
=
_mm256_fmadd_ps
(
_mm256_set1_ps
(
c
*
q8
.
scale
(
iy
,
i
)),
_mm256_cvtepi32_ps
(
prod
),
accd
[
iy
]);
}
}
#ifdef HAVE_FANCY_SIMD
const
__m512i
shuffles512
[
2
]
=
{
_mm512_set_epi64
(
0x0706070607060706
,
0x0302030203020302
,
0x0706070607060706
,
0x0302030203020302
,
0x0504050405040504
,
0x0100010001000100
,
0x0504050405040504
,
0x0100010001000100
),
_mm512_set_epi64
(
0x0f0e0f0e0f0e0f0e
,
0x0b0a0b0a0b0a0b0a
,
0x0f0e0f0e0f0e0f0e
,
0x0b0a0b0a0b0a0b0a
,
0x0d0c0d0c0d0c0d0c
,
0x0908090809080908
,
0x0d0c0d0c0d0c0d0c
,
0x0908090809080908
)
};
#endif
const
__m128i
shuffles
[
2
]
=
{
_mm_set_epi32
(
0x07060706
,
0x05040504
,
0x03020302
,
0x01000100
),
_mm_set_epi32
(
0x0f0e0f0e
,
0x0d0c0d0c
,
0x0b0a0b0a
,
0x09080908
)};
uint32_t
utmp
[
4
];
};
template
<
typename
Q8
>
inline
void
process_mins_16
(
const
__m256i
&
all_scales
,
const
Q8
&
q8
,
int
i
,
float
d
,
__m256
*
accm
)
{
for
(
int
iy
=
0
;
iy
<
Q8
::
nrc_y
;
++
iy
)
{
const
__m256i
prod
=
_mm256_madd_epi16
(
all_scales
,
q8
.
load_bsums
(
iy
,
i
));
accm
[
iy
]
=
_mm256_fmadd_ps
(
_mm256_set1_ps
(
d
*
q8
.
scale
(
iy
,
i
)),
_mm256_cvtepi32_ps
(
prod
),
accm
[
iy
]);
}
}
inline
void
prepare_scales_16
(
const
__m256i
&
all_scales
,
__m256i
*
scales
)
{
const
__m128i
l_scales
=
_mm256_extracti128_si256
(
all_scales
,
0
);
const
__m128i
h_scales
=
_mm256_extracti128_si256
(
all_scales
,
1
);
scales
[
0
]
=
MM256_SET_M128I
(
l_scales
,
l_scales
);
scales
[
1
]
=
MM256_SET_M128I
(
h_scales
,
h_scales
);
}
struct
ScaleQ3
{
inline
__m128i
make_scales
(
const
uint16_t
*
s8
)
const
{
const
uint16_t
*
scales16
=
(
const
uint16_t
*
)
s8
;
uint32_t
aux0
=
scales16
[
0
]
|
(
scales16
[
1
]
<<
16
);
uint32_t
aux1
=
scales16
[
2
]
|
(
scales16
[
3
]
<<
16
);
uint32_t
aux2
=
scales16
[
4
]
|
(
scales16
[
5
]
<<
16
);
__m128i
scales128
=
_mm_set_epi32
(
((
aux1
>>
4
)
&
0x0f0f0f0f
)
|
((
aux2
>>
2
)
&
0x30303030
),
((
aux0
>>
4
)
&
0x0f0f0f0f
)
|
((
aux2
>>
0
)
&
0x30303030
),
(
aux1
&
0x0f0f0f0f
)
|
((
aux2
<<
2
)
&
0x30303030
),
(
aux0
&
0x0f0f0f0f
)
|
((
aux2
<<
4
)
&
0x30303030
));
return
_mm_add_epi8
(
scales128
,
m32
);
}
const
__m128i
m32
=
_mm_set1_epi8
(
-
32
);
};
struct
ScaleIQ4XS
{
inline
__m128i
make_scales
(
const
uint32_t
scales_l
,
const
uint16_t
scales_h
)
{
uint32_t
tmp32
=
scales_h
|
(
scales_h
<<
14
);
const
__m128i
sh
=
_mm_slli_epi16
(
_mm_and_si128
(
_mm_srlv_epi32
(
_mm_set1_epi32
(
tmp32
),
hshift
),
hmask
),
4
);
const
__m128i
sl
=
_mm_and_si128
(
_mm_srlv_epi32
(
_mm_set1_epi32
(
scales_l
),
lshift
),
lmask
);
return
_mm_add_epi16
(
_mm_or_si128
(
sh
,
_mm_cvtepi8_epi16
(
_mm_shuffle_epi8
(
sl
,
lshuffle
))),
m32
);
}
const
__m128i
hshift
=
_mm_set_epi32
(
12
,
8
,
4
,
0
);
const
__m128i
lshift
=
_mm_set_epi32
(
4
,
0
,
4
,
0
);
const
__m128i
hmask
=
_mm_set1_epi16
(
0x03
);
const
__m128i
lmask
=
_mm_set1_epi8
(
0xf
);
const
__m128i
lshuffle
=
_mm_set_epi32
(
0x07030602
,
0x05010400
,
0x07030602
,
0x05010400
);
const
__m128i
m32
=
_mm_set1_epi16
(
-
32
);
};
template
<
typename
Block
>
struct
BaseDequantizer
{
BaseDequantizer
(
const
void
*
vx
,
size_t
bx
)
:
vx
(
vx
),
bx
(
bx
)
{}
inline
void
new_row
(
int
ix
)
{
x
=
(
const
Block
*
)((
const
char
*
)
vx
+
bx
*
ix
);
}
const
void
*
vx
;
size_t
bx
;
const
Block
*
x
;
float
d
;
};
#ifdef HAVE_FANCY_SIMD
//====================================== Zen4 ==================================================
struct
BlockPermuter
{
const
__m512i
permute1
=
_mm512_set_epi64
(
11
,
10
,
9
,
8
,
3
,
2
,
1
,
0
);
const
__m512i
permute2
=
_mm512_set_epi64
(
15
,
14
,
13
,
12
,
7
,
6
,
5
,
4
);
};
struct
Q4Bits
{
inline
void
prepare
(
const
uint8_t
*
q4
)
{
auto
q4bits
=
_mm512_loadu_si512
((
const
__m512i
*
)
q4
+
0
);
auto
tmp1
=
_mm512_and_si512
(
q4bits
,
ml
);
auto
tmp2
=
_mm512_and_si512
(
_mm512_srli_epi16
(
q4bits
,
4
),
ml
);
values
[
0
]
=
_mm512_permutex2var_epi64
(
tmp1
,
perm
.
permute1
,
tmp2
);
values
[
1
]
=
_mm512_permutex2var_epi64
(
tmp1
,
perm
.
permute2
,
tmp2
);
q4bits
=
_mm512_loadu_si512
((
const
__m512i
*
)
q4
+
1
);
tmp1
=
_mm512_and_si512
(
q4bits
,
ml
);
tmp2
=
_mm512_and_si512
(
_mm512_srli_epi16
(
q4bits
,
4
),
ml
);
values
[
2
]
=
_mm512_permutex2var_epi64
(
tmp1
,
perm
.
permute1
,
tmp2
);
values
[
3
]
=
_mm512_permutex2var_epi64
(
tmp1
,
perm
.
permute2
,
tmp2
);
}
inline
void
prepare64
(
const
uint8_t
*
q4
)
{
auto
q4bits
=
_mm512_loadu_si512
((
const
__m512i
*
)
q4
+
0
);
values
[
0
]
=
_mm512_and_si512
(
q4bits
,
ml
);
values
[
1
]
=
_mm512_and_si512
(
_mm512_srli_epi16
(
q4bits
,
4
),
ml
);
q4bits
=
_mm512_loadu_si512
((
const
__m512i
*
)
q4
+
1
);
values
[
2
]
=
_mm512_and_si512
(
q4bits
,
ml
);
values
[
3
]
=
_mm512_and_si512
(
_mm512_srli_epi16
(
q4bits
,
4
),
ml
);
}
__m512i
values
[
4
];
const
__m512i
ml
=
_mm512_set1_epi8
(
0xf
);
BlockPermuter
perm
;
};
struct
Q2Bits
{
inline
void
prepare
(
const
uint8_t
*
q2
)
{
auto
q2bits
=
_mm512_loadu_si512
((
const
__m512i
*
)
q2
);
auto
tmp
=
_mm512_srli_epi16
(
q2bits
,
2
);
values
[
0
]
=
_mm512_permutex2var_epi64
(
q2bits
,
perm
.
permute1
,
tmp
);
values
[
2
]
=
_mm512_permutex2var_epi64
(
q2bits
,
perm
.
permute2
,
tmp
);
values
[
1
]
=
_mm512_and_si512
(
_mm512_srli_epi16
(
values
[
0
],
4
),
ml
);
values
[
3
]
=
_mm512_and_si512
(
_mm512_srli_epi16
(
values
[
2
],
4
),
ml
);
values
[
0
]
=
_mm512_and_si512
(
values
[
0
],
ml
);
values
[
2
]
=
_mm512_and_si512
(
values
[
2
],
ml
);
}
__m512i
values
[
4
];
const
__m512i
ml
=
_mm512_set1_epi8
(
0x03
);
BlockPermuter
perm
;
};
struct
DequantizerQ4K
final
:
public
BaseDequantizer
<
block_q4_K
>
{
DequantizerQ4K
(
const
void
*
vx
,
size_t
bx
)
:
BaseDequantizer
(
vx
,
bx
)
{}
template
<
typename
Q8
>
inline
void
new_block
(
int
i
,
const
Q8
&
q8
,
__m256
*
accd
,
__m512i
*
scales
)
{
d
=
GGML_FP16_TO_FP32
(
x
[
i
]
.
d
);
bits
.
prepare
(
x
[
i
]
.
qs
);
auto
all_scales
=
s8k
.
process_mins_and_scales_64
(
x
[
i
]
.
scales
,
-
GGML_FP16_TO_FP32
(
x
[
i
]
.
dmin
),
i
,
q8
,
accd
);
scales
[
0
]
=
_mm512_shuffle_epi8
(
all_scales
,
s8k
.
shuffles512
[
0
]);
scales
[
1
]
=
_mm512_shuffle_epi8
(
all_scales
,
s8k
.
shuffles512
[
1
]);
}
Q4Bits
bits
;
Scales8K
s8k
;
};
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        prepare(x[i].qs);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        auto scales256 = MM256_SET_M128I(scales128, scales128);
        auto all_scales = _mm512_inserti32x8(_mm512_castsi256_si512(scales256), scales256, 1);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }
    static __m512i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        auto val256 = MM256_SET_M128I(val128, val128);
        return _mm512_inserti32x8(_mm512_castsi256_si512(val256), val256, 1);
    }
    inline void prepare(const uint8_t * q4) {
        bits.prepare64(q4);
        // We now have in bits.values[0]: 0...15, 32...47, 64...79, 96...111
        //                bits.values[1]: 16..31, 48...63, 80...95, 112..127
        //                etc.
        auto tmp = _mm512_permutex2var_epi64(bits.values[0], permute1, bits.values[1]);
        bits.values[1] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[0], permute2, bits.values[1]));
        bits.values[0] = _mm512_shuffle_epi8(values, tmp);
        tmp = _mm512_permutex2var_epi64(bits.values[2], permute1, bits.values[3]);
        bits.values[3] = _mm512_shuffle_epi8(values, _mm512_permutex2var_epi64(bits.values[2], permute2, bits.values[3]));
        bits.values[2] = _mm512_shuffle_epi8(values, tmp);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m512i values;
    const __m512i permute1 = _mm512_set_epi64(11, 10, 3, 2, 9, 8, 1, 0);
    const __m512i permute2 = _mm512_set_epi64(15, 14, 7, 6, 13, 12, 5, 4);
};
struct HighBit5 {
    inline void apply(const uint8_t * h, Q4Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(hbits, mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x10);
};
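// HighBit5 merges the fifth bit of each Q5_K quant into the 4-bit values
// produced by Q4Bits: the 256 high bits are broadcast into both 256-bit halves
// of a 512-bit register (the upper half pre-shifted by 1), then shifted so the
// relevant bit lands at position 4 (mask 0x10) of every byte and OR-ed in.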
struct HighBit3 {
    inline void apply(const uint8_t * h, Q2Bits& bits) {
        auto hbits256 = _mm256_loadu_si256((const __m256i *)h);
        auto hbits = _mm512_inserti32x8(_mm512_castsi256_si512(hbits256), _mm256_srli_epi16(hbits256, 1), 1);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_and_si512(hbits, mh));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_and_si512(_mm512_srli_epi16(hbits, 4), mh));
    }
    const __m512i mh = _mm512_set1_epi8(0x04);
};
struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accd, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].qh, bits);
        auto all_scales = s8k.process_mins_and_scales_64(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
        scales[0] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[0]);
        scales[1] = _mm512_shuffle_epi8(all_scales, s8k.shuffles512[1]);
    }

    Q4Bits bits;
    HighBit5 hbits;
    Scales8K s8k;
};
struct Scale16 {
    inline void make_scales(const __m128i& scales8, __m512i * scales) const {
        auto all_scales8 = MM256_SET_M128I(scales8, scales8);
        auto scales1 = _mm256_shuffle_epi8(all_scales8, shuffle1);
        auto scales2 = _mm256_shuffle_epi8(all_scales8, shuffle2);
        scales[0] = _mm512_cvtepi8_epi16(scales1);
        scales[1] = _mm512_cvtepi8_epi16(scales2);
    }
    template <typename Q8>
    inline void process_mins_and_scales(int i, float c, const __m128i& mins8, const __m128i& scales8,
            const Q8& q8, __m256 * accm, __m512i * scales) const {
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, c, accm);
        make_scales(scales8, scales);
    }
    const __m256i shuffle1 = _mm256_set_epi32(0x07070707, 0x03030303, 0x06060606, 0x02020202,
                                              0x05050505, 0x01010101, 0x04040404, 0x00000000);
    const __m256i shuffle2 = _mm256_set_epi32(0x0f0f0f0f, 0x0b0b0b0b, 0x0e0e0e0e, 0x0a0a0a0a,
                                              0x0d0d0d0d, 0x09090909, 0x0c0c0c0c, 0x08080808);
};
struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i *)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        sc16.process_mins_and_scales(i, -GGML_FP16_TO_FP32(x[i].dmin), mins8, scales8, q8, accm, scales);
    }

    Q2Bits bits;
    Scale16 sc16;
    const __m128i m4 = _mm_set1_epi8(0xf);
};
struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare(x[i].qs);
        hbits.apply(x[i].hmask, bits);
        auto scales128 = sc3.make_scales((const uint16_t *)x[i].scales);
        sc16.process_mins_and_scales(i, -4.f*d, scales128, scales128, q8, accm, scales);
    }

    Q2Bits bits;
    HighBit3 hbits;
    ScaleQ3 sc3;
    Scale16 sc16;
    const __m128i m4  = _mm_set1_epi8(0xf);
    const __m128i m32 = _mm_set1_epi8(-32);
};
struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m512i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        bits.prepare64(x[i].ql);
        add_high_bits(x[i].qh, bits);
        auto scales128 = _mm_loadu_si128((const __m128i *)x[i].scales);
        sc16.process_mins_and_scales(i, -32.f*d, scales128, scales128, q8, accm, scales);
    }
    inline void add_high_bits(const uint8_t * qh, Q4Bits& bits) const {
        auto hbits = _mm512_loadu_si512((const __m512i *)qh);
        auto tmp1 = _mm512_and_si512(_mm512_slli_epi16(hbits, 4), mh);
        auto tmp2 = _mm512_and_si512(_mm512_slli_epi16(hbits, 2), mh);
        bits.values[0] = _mm512_or_si512(bits.values[0], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[2] = _mm512_or_si512(bits.values[2], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
        tmp1 = _mm512_and_si512(hbits, mh);
        tmp2 = _mm512_and_si512(_mm512_srli_epi16(hbits, 2), mh);
        bits.values[1] = _mm512_or_si512(bits.values[1], _mm512_permutex2var_epi64(tmp1, bits.perm.permute1, tmp2));
        bits.values[3] = _mm512_or_si512(bits.values[3], _mm512_permutex2var_epi64(tmp1, bits.perm.permute2, tmp2));
    }

    Q4Bits bits;
    HighBit3 hbits;
    Scale16 sc16;
    const __m512i mh = _mm512_set1_epi8(0x30);
};
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accm[nrc_y];
    __m512  accd[nrc_y];
    __m512i scales[2];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm512_setzero_ps();
        for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accm, scales);

            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m512i p1 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[0], q8.load_quants(iy, i, 0));
                const __m512i p2 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[1], q8.load_quants(iy, i, 1));
                const __m512i p3 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[2], q8.load_quants(iy, i, 2));
                const __m512i p4 = _mm512_dpbusd_epi32(_mm512_setzero_si512(), deq.bits.values[3], q8.load_quants(iy, i, 3));
                auto sumi = _mm512_dpwssd_epi32(_mm512_setzero_si512(), scales[0], _mm512_packs_epi32(p1, p2));
                sumi = _mm512_dpwssd_epi32(sumi, scales[1], _mm512_packs_epi32(p3, p4));
                accd[iy] = _mm512_fmadd_ps(_mm512_set1_ps(deq.d*q8.scale(iy, i)), _mm512_cvtepi32_ps(sumi), accd[iy]);
            }

        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            auto sum256 = _mm256_add_ps(_mm512_castps512_ps256(accd[iy]), _mm512_extractf32x8_ps(accd[iy], 1));
            info.store(ix, iy, hsum_float_8(_mm256_add_ps(accm[iy], sum256)));
        }
    }
}
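// The Zen4/AVX512 kernel above multiplies nrc_y rows of q8_K activations with
// one quantized weight row per ix iteration: _mm512_dpbusd_epi32 forms
// unsigned x signed byte dot products per 32-bit lane, _mm512_packs_epi32
// narrows the four partial sums to 16 bits so _mm512_dpwssd_epi32 can apply
// the per-sub-block 16-bit scales, and the final fmadd folds in the combined
// fp32 super-block scale deq.d * q8.scale(iy, i).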
#else
// ===================================== Vanilla AVX2 =====================================
struct Q4Bits {
    inline void prepare(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j + 0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j + 1);
        values[2] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare64(const uint8_t * q4, int j) {
        auto q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j + 0);
        values[0] = _mm256_and_si256(q4bits, ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
        q4bits = _mm256_loadu_si256((const __m256i*)q4 + 2*j + 1);
        values[1] = _mm256_and_si256(q4bits, ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q4bits, 4), ml);
    }
    inline void prepare16(const uint8_t * q4, int j) {
        values[0] = dequant16(q4 + 64*j +  0);
        values[1] = dequant16(q4 + 64*j + 16);
        values[2] = dequant16(q4 + 64*j + 32);
        values[3] = dequant16(q4 + 64*j + 48);
    }
    inline __m256i dequant16(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        const __m256i aux256 = MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128);
        return _mm256_and_si256(ml, aux256);
    };
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0xf);
};
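// The AVX2 variant of Q4Bits works on 32-byte halves and takes the sub-block
// index j explicitly; prepare/prepare64/prepare16 differ only in which order
// (and from which source layout) the four 32-quant groups land in values[].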
struct Q2Bits {
    inline void prepare(const uint8_t * q2, int j) {
        auto q2bits = _mm256_loadu_si256((const __m256i*)q2 + j);
        values[0] = _mm256_and_si256(q2bits, ml);
        values[1] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 2), ml);
        values[2] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 4), ml);
        values[3] = _mm256_and_si256(_mm256_srli_epi16(q2bits, 6), ml);
    }
    __m256i values[4];
    const __m256i ml = _mm256_set1_epi8(0x03);
};
struct HighBit5 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q4Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 3), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x10);
    __m256i hbits;
};
struct HighBit3 {
    inline void load(const uint8_t * h) { hbits = _mm256_loadu_si256((const __m256i *)h); }
    inline void apply(Q2Bits& bits, bool do_shift) {
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 1), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 1), mh));
        if (do_shift) {
            hbits = _mm256_srli_epi16(hbits, 4);
        }
    }
    const __m256i mh = _mm256_set1_epi8(0x04);
    __m256i hbits;
};
inline __m256i get_scale_shuffle_8(int i) {
    return _mm256_set1_epi16((2*i) | ((2*i+1) << 8));
}
inline void set_scales_8(const __m256i& all_scales, int j, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_8(4*j+3));
}
template <typename Q8, typename Bits>
inline void multiply_add(const Bits& bits, const __m256i * scales, int j, int i, const Q8& q8, __m256i * sumi) {
    if (j == 0) {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 0)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 1)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 2)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 3)));
            sumi[iy] = _mm256_add_epi32(_mm256_add_epi32(p1, p3), _mm256_add_epi32(p2, p4));
        }
    } else {
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            const __m256i p1 = _mm256_madd_epi16(scales[0], _mm256_maddubs_epi16(bits.values[0], q8.load_quants(iy, i, 4)));
            const __m256i p2 = _mm256_madd_epi16(scales[1], _mm256_maddubs_epi16(bits.values[1], q8.load_quants(iy, i, 5)));
            const __m256i p3 = _mm256_madd_epi16(scales[2], _mm256_maddubs_epi16(bits.values[2], q8.load_quants(iy, i, 6)));
            const __m256i p4 = _mm256_madd_epi16(scales[3], _mm256_maddubs_epi16(bits.values[3], q8.load_quants(iy, i, 7)));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p1, p3));
            sumi[iy] = _mm256_add_epi32(sumi[iy], _mm256_add_epi32(p2, p4));
        }
    }
}
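// multiply_add is the AVX2 dot-product core: _mm256_maddubs_epi16 multiplies
// unsigned weight bytes with signed q8 bytes, summing adjacent pairs into
// (saturating) 16-bit lanes, and _mm256_madd_epi16 then applies the 16-bit
// sub-block scales while widening to 32 bits. j == 0 initializes sumi[];
// j == 1 accumulates the second 128 quants of the super-block into it.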
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q4Bits bits;
    Scales8K s8k;
};
struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    DequantizerIQ4XS(const void * vx, size_t bx) : BaseDequantizer(vx, bx), values(load_values()) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales128 = siq4.make_scales(*(const uint32_t *)x[i].scales_l, x[i].scales_h);
        s8k.accum_mins(scales128, q8, i, -128.f*d, accd);
        return MM256_SET_M128I(scales128, scales128);
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs, j);
        bits.values[0] = _mm256_shuffle_epi8(values, bits.values[0]);
        bits.values[1] = _mm256_shuffle_epi8(values, bits.values[1]);
        bits.values[2] = _mm256_shuffle_epi8(values, bits.values[2]);
        bits.values[3] = _mm256_shuffle_epi8(values, bits.values[3]);
    }
    static __m256i load_values() {
        static const uint8_t kvalues_iq4nl[16] = {1, 24, 45, 63, 79, 93, 106, 118, 129, 141, 153, 166, 181, 197, 217, 241};
        auto val128 = _mm_loadu_si128((const __m128i *)kvalues_iq4nl);
        return MM256_SET_M128I(val128, val128);
    }

    Q4Bits bits;
    Scales8K s8k;
    ScaleIQ4XS siq4;
    const __m256i values;
};
struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline __m256i new_block(int i, const Q8& q8, __m256 * accd) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].qh);
        return s8k.process_mins_and_scales(x[i].scales, -GGML_FP16_TO_FP32(x[i].dmin), i, q8, accd);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q4Bits bits;
    HighBit5 hbits;
    Scales8K s8k;
};
template <typename Q8>
inline void process_mins_and_scales_16(const __m128i& scales128, const Q8& q8, int i, float d,
        __m256 * accm, __m256i * scales) {
    const __m256i all_scales = _mm256_cvtepi8_epi16(scales128);
    process_mins_16(all_scales, q8, i, d, accm);
    prepare_scales_16(all_scales, scales);
}
struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        hbits.load(x[i].hmask);
        process_mins_and_scales_16(sc3.make_scales((const uint16_t *)x[i].scales), q8, i, -4.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
        hbits.apply(bits, j == 0);
    }

    Q2Bits bits;
    HighBit3 hbits;
    ScaleQ3 sc3;
    const __m128i m32 = _mm_set1_epi8(-32);
};
struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        const __m128i mins_and_scales = _mm_loadu_si128((const __m128i *)x[i].scales);
        const __m128i scales8 = _mm_and_si128(mins_and_scales, m4);
        const __m128i mins8 = _mm_and_si128(_mm_srli_epi16(mins_and_scales, 4), m4);
        process_mins_16(_mm256_cvtepi8_epi16(mins8), q8, i, -GGML_FP16_TO_FP32(x[i].dmin), accm);
        prepare_scales_16(_mm256_cvtepi8_epi16(scales8), scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs, j);
    }

    Q2Bits bits;
    const __m128i m4 = _mm_set1_epi8(0xf);
};
struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx) : BaseDequantizer(vx, bx) {}
    template <typename Q8>
    inline void new_block(int i, const Q8& q8, __m256 * accm, __m256i * scales) {
        d = GGML_FP16_TO_FP32(x[i].d);
        process_mins_and_scales_16(_mm_loadu_si128((const __m128i *)x[i].scales), q8, i, -32.f*d, accm, scales);
    }
    inline void prepare(int i, int j) {
        bits.prepare64(x[i].ql, j);
        auto hbits = _mm256_loadu_si256((const __m256i *)x[i].qh + j);
        bits.values[0] = _mm256_or_si256(bits.values[0], _mm256_and_si256(_mm256_slli_epi16(hbits, 4), mh));
        bits.values[1] = _mm256_or_si256(bits.values[1], _mm256_and_si256(_mm256_slli_epi16(hbits, 2), mh));
        bits.values[2] = _mm256_or_si256(bits.values[2], _mm256_and_si256(hbits, mh));
        bits.values[3] = _mm256_or_si256(bits.values[3], _mm256_and_si256(_mm256_srli_epi16(hbits, 2), mh));
    }

    Q4Bits bits;
    const __m256i mh = _mm256_set1_epi8(0x30);
};
inline __m256i get_scale_shuffle_16(int i) {
    static const uint8_t k_shuffle[128] = {
         0, 1,  0, 1,  0, 1,  0, 1,  0, 1,  0, 1,  0, 1,  0, 1,     2, 3,  2, 3,  2, 3,  2, 3,  2, 3,  2, 3,  2, 3,  2, 3,
         4, 5,  4, 5,  4, 5,  4, 5,  4, 5,  4, 5,  4, 5,  4, 5,     6, 7,  6, 7,  6, 7,  6, 7,  6, 7,  6, 7,  6, 7,  6, 7,
         8, 9,  8, 9,  8, 9,  8, 9,  8, 9,  8, 9,  8, 9,  8, 9,    10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11, 10,11,
        12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13, 12,13,    14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15, 14,15,
    };
    return _mm256_loadu_si256((const __m256i*)k_shuffle + i);
}
inline void set_scales_16(const __m256i& all_scales, __m256i * scales) {
    scales[0] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(0));
    scales[1] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(1));
    scales[2] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(2));
    scales[3] = _mm256_shuffle_epi8(all_scales, get_scale_shuffle_16(3));
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qY_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    __m256i all_scales[2];
    __m256i scales[4];
    __m256  accd[nrc_y];

    Dequantizer deq(vx, bx);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        for (int i = 0; i < nb; ++i) {

            deq.new_block(i, q8, accd, all_scales);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                set_scales_16(all_scales[j], scales);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                accd[iy] = _mm256_fmadd_ps(_mm256_set1_ps(deq.d*q8.scale(iy, i)), _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }
        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }
    }
}
template <typename Dequantizer, int nrc_y>
static void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y> q8(info);

    Dequantizer deq(vx, bx);

    __m256  accd[nrc_y];
    __m256i scales[4];

    for (int ix = 0; ix < nrc_x; ++ix) {

        for (int iy = 0; iy < nrc_y; ++iy) accd[iy] = _mm256_setzero_ps();

        deq.new_row(ix);

        for (int i = 0; i < nb; ++i) {

            auto all_scales = deq.new_block(i, q8, accd);

            __m256i sumi[nrc_y];

            for (int j = 0; j < QK_K/128; ++j) {
                deq.prepare(i, j);
                set_scales_8(all_scales, j, scales);
                multiply_add(deq.bits, scales, j, i, q8, sumi);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                const __m256 vd = _mm256_set1_ps(deq.d*q8.scale(iy, i));
                accd[iy] = _mm256_fmadd_ps(vd, _mm256_cvtepi32_ps(sumi[iy]), accd[iy]);
            }
        }

        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, hsum_float_8(accd[iy]));
        }
    }
}
#endif // Zen4 or vanilla AVX2
//
// ============================== Legacy quants
//
struct DotHelper {
    const __m256i m1 = _mm256_set1_epi16(1);
#if defined(__AVX512VNNI__) && defined(__AVX512VL__)
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_dpbusd_epi32(_mm256_setzero_si256(), x, y);
    }
#else
    inline __m256i dot(__m256i x, __m256i y) const {
        return _mm256_madd_epi16(m1, _mm256_maddubs_epi16(x, y));
    }
#endif
};
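// DotHelper picks the best available unsigned x signed byte dot product at
// compile time: a single VNNI instruction when AVX512VNNI/VL are available,
// otherwise the classic maddubs + madd(1) pair, which produces the same result
// layout (sums of groups of 4 byte products per 32-bit lane).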
struct SignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(_mm256_sign_epi8(x, x), _mm256_sign_epi8(y, x));
    }
};
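// SignedDot handles signed x signed inputs with the unsigned-x-signed dot above
// via the usual sign trick: x*y == |x| * (sign(x)*y). Scalar sketch of the same
// computation (illustration only, not part of the build):
#if 0
static int32_t signed_dot_scalar(const int8_t * x, const int8_t * y, int n) {
    int32_t sum = 0;
    for (int k = 0; k < n; ++k) {
        const int ax = x[k] < 0 ? -x[k] : x[k];              // _mm256_sign_epi8(x, x)
        const int sy = x[k] < 0 ? -y[k] : (x[k] ? y[k] : 0); // _mm256_sign_epi8(y, x)
        sum += ax * sy;                                      // unsigned * signed product
    }
    return sum;
}
#endif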
struct UnsignedDot {
    DotHelper helper;
    inline __m256i compute(__m256i x, __m256i y) const {
        return helper.dot(x, y);
    }
};
template <typename Q8, typename Dot> struct Sum4 {
    Dot dot;
    inline __m256i compute(const __m256i * qx, const Q8 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p0, p1));    // 0,0, 1,1, 0,0, 1,1
        const __m256i p23 = _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p2, p3));    // 2,2, 3,3, 2,2, 3,3
        return _mm256_madd_epi16(dot.helper.m1, _mm256_packs_epi32(p01, p23));               // 0,1,2,3, 0,1,2,3
    }
};
struct Sum4_Q8 {
    SignedDot dot;
    static inline __m256i add1(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi32(a, b), _mm256_unpackhi_epi32(a, b));
    }
    static inline __m256i add2(__m256i a, __m256i b) {
        return _mm256_add_epi32(_mm256_unpacklo_epi64(a, b), _mm256_unpackhi_epi64(a, b));
    }
    inline __m256i compute(const __m256i * qx, const block_q8_0 * y) const {
        const __m256i p0 = dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[0].qs));
        const __m256i p1 = dot.compute(qx[1], _mm256_loadu_si256((const __m256i *)y[1].qs));
        const __m256i p2 = dot.compute(qx[2], _mm256_loadu_si256((const __m256i *)y[2].qs));
        const __m256i p3 = dot.compute(qx[3], _mm256_loadu_si256((const __m256i *)y[3].qs));
        const __m256i p01 = add1(p0, p1);    // 0,1, 0,1, 0,1, 0,1
        const __m256i p23 = add1(p2, p3);    // 2,3, 2,3, 2,3, 2,3
        return add2(p01, p23);               // returns 0,1,2,3, 0,1,2,3
    }
};
struct ScaleHelperQ_0 {
    ggml_half scales8[4];
    template <typename Q>
    inline __m128 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) scales8[j] = y[j].d;
        return _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)scales8));
    }
    template <typename Q>
    inline __m128 prepare4(__m128 other_scales, const Q * y) {
        return _mm_mul_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline float prepare1(const Q * y) const { return GGML_FP16_TO_FP32(y->d); }
    template <typename Q> inline float prepare1(float d, const Q * y) const { return d*prepare1(y); }
};
struct ScaleHelperQ_1 {
    uint32_t scales8[4];
    const __m128i shuffle = _mm_set_epi16(0x0f0e, 0x0b0a, 0x0706, 0x0302, 0x0d0c, 0x0908, 0x0504, 0x0100);
    template <typename Q>
    inline __m256 prepare4(const Q * y) {
        for (int j = 0; j < 4; ++j) {
            // it is slightly faster to directly dereference (const uint32_t *)&y[j].d, but some compilers
            // complain that this breaks strict-aliasing rules.
            memcpy(scales8 + j, &y[j].d, sizeof(uint32_t));
        }
        return _mm256_cvtph_ps(_mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)scales8), shuffle));
    }
    template <typename Q>
    inline __m256 prepare4(__m256 other_scales, const Q * y) {
        return _mm256_mul_ps(other_scales, prepare4<Q>(y));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const Q * y) const {
        return std::make_pair(GGML_FP16_TO_FP32(y->d), GGML_FP16_TO_FP32(y->m));
    }
    template <typename Q> inline std::pair<float, float> prepare1(const std::pair<float, float>& dm, const Q * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->m));
    }
    std::pair<float, float> inline prepare1(const std::pair<float, float>& dm, const block_q8_1 * y) const {
        return std::make_pair(dm.first*GGML_FP16_TO_FP32(y->d), dm.second*GGML_FP16_TO_FP32(y->s));
    }
};
struct MinusType0 {
    inline __m256 compute(__m128 d, int) const { return _mm256_set_m128(d, d); }
    inline float compute(float d, int) const { return d; }
    inline float result(__m256 acc, int) const { return hsum_float_8(acc); }
};
template <int nrc_y> struct MinusType1 {
    __m128 accm[nrc_y];
    MinusType1() { for (int iy = 0; iy < nrc_y; ++iy) accm[iy] = _mm_setzero_ps(); }
    inline __m256 compute(__m256 dm, int iy) {
        const __m128 d = _mm256_castps256_ps128(dm);
        const __m128 m = _mm256_extractf128_ps(dm, 1);
        accm[iy] = _mm_add_ps(accm[iy], m);
        return _mm256_set_m128(d, d);
    }
    inline float compute(const std::pair<float, float>& dm, int iy) {
        accm[iy] = _mm_add_ps(accm[iy], _mm_set1_ps(dm.second*0.25f));
        return dm.first;
    }
    inline float result(__m256 acc, int iy) const {
        const __m128 sum = _mm_add_ps(_mm256_castps256_ps128(acc), _mm256_extractf128_ps(acc, 1));
        return hsum_float_4(_mm_add_ps(sum, accm[iy]));
    }
};
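// MinusType0/MinusType1 abstract the difference between _0 quants (scale only)
// and _1 quants (scale + min): MinusType1 collects the min * sum(q8) terms in
// accm and folds them into the horizontal sum in result(). The 0.25f in the
// scalar path appears to compensate for broadcasting one value across the four
// lanes of accm[iy], which are all summed again by hsum_float_4 at the end.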
template <typename Minus, int nrc_y, bool is_multiple_of_4> struct AccumT {
    __m256 acc[nrc_y];
    Minus accm;
    AccumT() { for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = _mm256_setzero_ps(); }
    template <typename Unpacker, typename Scales, typename Sum, typename Q8>
    inline void compute(int nb, Unpacker& unp, Scales& scales, Sum& sum, const Q8 ** y, const DataInfo& info, int ix) {
        auto qx = unp.quants();
        __m256 dall[nrc_y];
        for (int i = 0; i < nb/4; ++i) {
            auto other_scales = unp.set_block_4(i);
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto s12 = scales.prepare4(other_scales, y[iy] + 4*i);
                dall[iy] = accm.compute(s12, iy);
            }
            for (int iy = 0; iy < nrc_y; ++iy) {
                auto pall = sum.compute(qx, y[iy] + 4*i);
                acc[iy] = _mm256_fmadd_ps(dall[iy], _mm256_cvtepi32_ps(pall), acc[iy]);
            }
        }
        if (!is_multiple_of_4) {
            for (int i = 4*(nb/4); i < nb; ++i) {
                auto other_scales = unp.set_block(i);
                for (int iy = 0; iy < nrc_y; ++iy) {
                    auto s12 = scales.prepare1(other_scales, y[iy] + i);
                    auto d = accm.compute(s12, iy);
                    const __m256i p0 = sum.dot.compute(qx[0], _mm256_loadu_si256((const __m256i *)y[iy][i].qs));
                    acc[iy] = _mm256_fmadd_ps(_mm256_set1_ps(d), _mm256_cvtepi32_ps(p0), acc[iy]);
                }
            }
        }
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, accm.result(acc[iy], iy));
            //s[iy*bs] = accm.result(acc[iy], iy);
        }
    }
};
template <int nrc_y, bool is_multiple_of_4>
using AccumType0 = AccumT<MinusType0, nrc_y, is_multiple_of_4>;
template <int nrc_y, bool is_multiple_of_4>
using AccumType1 = AccumT<MinusType1<nrc_y>, nrc_y, is_multiple_of_4>;

using Sum4Type0 = Sum4<block_q8_0, SignedDot>;
using Sum4Type1 = Sum4<block_q8_1, UnsignedDot>;
template <typename Unpacker, typename Sum4Type, typename AccumType, typename Scales, typename Q8, int nrc_y>
void mul_mat_qX_q8_Helper(int nb, const void * vx, size_t bx, const DataInfo& info, const Q8 ** y, int nrc_x) {
    Unpacker unp(vx, bx);
    Sum4Type sum4;
    Scales scales;
    for (int ix = 0; ix < nrc_x; ++ix) {
        unp.set_row(ix);
        AccumType accum;
        accum.compute(nb, unp, scales, sum4, y, info, ix);
    }
}
template <typename Unpacker, int nrc_y>
void mul_mat_qX_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type0, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}
template <typename Unpacker, int nrc_y>
void mul_mat_qX_1_q8_1_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_1> q8(info);
    int nb = n/Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, true>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Unpacker, Sum4Type1, AccumType1<nrc_y, false>, ScaleHelperQ_1, block_q8_1, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}
struct Dequantizer4bit {
    const __m256i m4 = _mm256_set1_epi8(0xf);
    inline __m256i dequant(const uint8_t * qs) const {
        const __m128i aux128 = _mm_loadu_si128((const __m128i *)qs);
        return _mm256_and_si256(MM256_SET_M128I(_mm_srli_epi16(aux128, 4), aux128), m4);
    }
};
struct Q8_0_Dequantizer {
    inline __m256i dequant(const block_q8_0 * x) const {
        return _mm256_loadu_si256((const __m256i *)x->qs);
    }
};
struct Q4_0_Dequantizer {
    Dequantizer4bit b4;
    const __m256i m8 = _mm256_set1_epi8(-8);
    inline __m256i dequant(const block_q4_0 * x) const {
        return _mm256_add_epi8(b4.dequant(x->qs), m8);
    }
};
struct Q4_1_Dequantizer {
    Dequantizer4bit b4;
    inline __m256i dequant(const block_q4_1 * x) const {
        return b4.dequant(x->qs);
    }
};
struct HBitDequantizer {
    const __m256i shuffle = _mm256_set_epi64x(0x0303030303030303, 0x0202020202020202, 0x0101010101010101, 0x0000000000000000);
    const __m256i mask = _mm256_set1_epi64x(0x7fbfdfeff7fbfdfe);
    const __m256i minus1 = _mm256_set1_epi64x(-1);
    inline __m256i to_bytes(const uint8_t * bits) const {
        // Note: Data in all ggml quants is at least 2-byte aligned.
        // => we can cast to uint16_t and use or on two consecutive entries
        //    which is faster than memcpy
        const uint16_t * aux16 = (const uint16_t *)bits;
        const uint32_t aux32 = aux16[0] | (aux16[1] << 16);
        //uint32_t aux32; memcpy(&aux32, bits, sizeof(uint32_t));
        __m256i bytes = _mm256_shuffle_epi8(_mm256_set1_epi32(aux32), shuffle);
        bytes = _mm256_or_si256(bytes, mask);
        return _mm256_cmpeq_epi8(bytes, minus1);
    }
};
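// HBitDequantizer expands 32 packed high bits into a 32-byte lane mask: the
// shuffle broadcasts source byte k to lanes 8k..8k+7, the OR with
// 0x7fbfdfeff7fbfdfe sets every bit except the one a given lane is testing, and
// the compare against -1 therefore yields 0xff exactly in the lanes whose bit
// was set (the standard bit-to-byte-mask idiom).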
struct Q5_0_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8((char)0xF0);
    inline __m256i dequant(const block_q5_0 * x) const {
        const __m256i vqh = _mm256_andnot_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};
struct Q5_1_Dequantizer {
    Dequantizer4bit b4;
    HBitDequantizer hbit;
    const __m256i mh = _mm256_set1_epi8(0x10);
    inline __m256i dequant(const block_q5_1 * x) const {
        const __m256i vqh = _mm256_and_si256(hbit.to_bytes(x->qh), mh);
        return _mm256_or_si256(b4.dequant(x->qs), vqh);
    }
};
template <typename Q, typename Scales, typename Dequantizer>
struct Q_Unpacker {
    Q_Unpacker(const void * vx, size_t bx) : cx_0((const char *)vx), x((const Q*)cx_0), bx(bx) {}

    const char * cx_0;
    const Q * x;
    size_t bx;

    Scales scales;
    Dequantizer deq;

    __m256i qx[4];

    inline const __m256i* quants() const { return qx; }

    inline void set_row(int ix) { x = (const Q*)(cx_0 + ix*bx); }

    inline auto set_block_4(int i) {
        for (int j = 0; j < 4; ++j) {
            qx[j] = deq.dequant(x + 4*i + j);
        }
        return scales.prepare4(x + 4*i);
    }
    inline auto set_block(int i) {
        qx[0] = deq.dequant(x + i);
        return scales.prepare1(x + i);
    }
};
struct Q8_0_Unpacker final : public Q_Unpacker<block_q8_0, ScaleHelperQ_0, Q8_0_Dequantizer> {
    Q8_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q4_0_Unpacker final : public Q_Unpacker<block_q4_0, ScaleHelperQ_0, Q4_0_Dequantizer> {
    Q4_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_0; }
};
struct Q5_0_Unpacker final : public Q_Unpacker<block_q5_0, ScaleHelperQ_0, Q5_0_Dequantizer> {
    Q5_0_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK5_0; }
};
struct Q4_1_Unpacker final : public Q_Unpacker<block_q4_1, ScaleHelperQ_1, Q4_1_Dequantizer> {
    Q4_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};
struct Q5_1_Unpacker final : public Q_Unpacker<block_q5_1, ScaleHelperQ_1, Q5_1_Dequantizer> {
    Q5_1_Unpacker(const void * vx, size_t bx) : Q_Unpacker(vx, bx) {}
    inline static int block_size() { return QK4_1; }
};
template <int nrc_y>
void mul_mat_q8_0_q8_0_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % Q8_0_Unpacker::block_size() == 0);
    Q8<nrc_y, block_q8_0> q8(info);
    int nb = n/Q8_0_Unpacker::block_size();
    if (nb%4 == 0) {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, true>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    } else {
        mul_mat_qX_q8_Helper<Q8_0_Unpacker, Sum4_Q8, AccumType0<nrc_y, false>, ScaleHelperQ_0, block_q8_0, nrc_y>(
                nb, vx, bx, info, q8.y, nrc_x
        );
    }
}
template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, Q4_0_Unpacker> || std::is_same_v<Dequantizer, Q5_0_Unpacker>) {
        m.funcs[0] = mul_mat_qX_0_q8_0_T<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0_T<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0_T<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_0_q8_0_T<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_0_q8_0_T<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_0_q8_0_T<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_0_q8_0_T<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0_T<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, Q4_1_Unpacker> || std::is_same_v<Dequantizer, Q5_1_Unpacker>) {
        m.funcs[0] = mul_mat_qX_1_q8_1_T<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1_T<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1_T<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_1_q8_1_T<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_1_q8_1_T<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_1_q8_1_T<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_1_q8_1_T<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1_T<Dequantizer, 8>;
    }
    else {
#ifdef HAVE_FANCY_SIMD
        m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
#else
        if constexpr (std::is_same_v<Dequantizer, DequantizerQ2K> ||
                      std::is_same_v<Dequantizer, DequantizerQ3K> ||
                      std::is_same_v<Dequantizer, DequantizerQ6K>) {
            m.funcs[0] = mul_mat_qY_K_q8_K_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qY_K_q8_K_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qY_K_q8_K_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qY_K_q8_K_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qY_K_q8_K_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qY_K_q8_K_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qY_K_q8_K_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qY_K_q8_K_T<Dequantizer, 8>;
        }
        else {
            m.funcs[0] = mul_mat_qX_K_q8_K_T<Dequantizer, 1>;
            m.funcs[1] = mul_mat_qX_K_q8_K_T<Dequantizer, 2>;
            m.funcs[2] = mul_mat_qX_K_q8_K_T<Dequantizer, 3>;
            m.funcs[3] = mul_mat_qX_K_q8_K_T<Dequantizer, 4>;
            m.funcs[4] = mul_mat_qX_K_q8_K_T<Dequantizer, 5>;
            m.funcs[5] = mul_mat_qX_K_q8_K_T<Dequantizer, 6>;
            m.funcs[6] = mul_mat_qX_K_q8_K_T<Dequantizer, 7>;
            m.funcs[7] = mul_mat_qX_K_q8_K_T<Dequantizer, 8>;
        }
#endif
    }
}
bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& mm, int& row_size_q8, int) {

    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);

    switch (typeA) {
        case GGML_TYPE_Q2_K:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ2K>(mm);
            break;
        case GGML_TYPE_Q3_K:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ3K>(mm);
            break;
        case GGML_TYPE_Q4_K:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ4K>(mm);
            break;
        case GGML_TYPE_Q5_K:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ5K>(mm);
            break;
        case GGML_TYPE_Q6_K:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerQ6K>(mm);
            break;
        case GGML_TYPE_IQ4_XS:
            assert(ne00 % QK_K == 0);
            MulMat::set_functions<DequantizerIQ4XS>(mm);
            break;
        case GGML_TYPE_Q4_0:
            assert(ne00 % QK4_0 == 0);
            MulMat::set_functions<Q4_0_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            assert(ne00 % QK4_1 == 0);
            MulMat::set_functions<Q4_1_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            assert(ne00 % QK5_0 == 0);
            MulMat::set_functions<Q5_0_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            assert(ne00 % QK5_1 == 0);
            MulMat::set_functions<Q5_1_Unpacker>(mm);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        default:
            return false;
    }

    return true;
}
}  // namespace

#else   // __aarch64__
namespace {

template <int nrc, typename block_q8 = block_q8_K> struct Q8 {
    constexpr static int nrc_y = nrc;
    Q8(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8 *)info.src1_row(iy);
    }
    inline int8x16_t load_quants_16(int iy, int i, int j) const { return vld1q_s8(y[iy][i].qs + 16*j); }
    inline int8x16x2_t load_quants(int iy, int i, int j) const { return vld1q_s8_x2(y[iy][i].qs + 32*j); }
    inline int8x16x4_t load_quants_64(int iy, int i, int j) const { return vld1q_s8_x4(y[iy][i].qs + 64*j); }
    inline int16x8x2_t load_bsums(int iy, int i) const { return vld1q_s16_x2(y[iy][i].bsums); }
    inline int16x8_t load_bsums8(int iy, int i) const {
        auto q8s = vld1q_s16_x2(y[iy][i].bsums);
        return vpaddq_s16(q8s.val[0], q8s.val[1]);
    }
    inline float scale(int iy, int i) const { return y[iy][i].d; }

    const block_q8 * y[nrc_y];
};
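// On NEON the Q8 helper plays the same role as on x86: it caches one
// block_q8_K pointer per activation row and exposes 16/32/64-byte quant loads
// plus the per-block bsums used for the mins corrections.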
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_T(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        //#pragma GCC unroll 4
        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (nrc_y > 1 && Dequantizer::should_scale_quants()) {
                deq.process_scales(i, q8, acc);
                deq.prepare(i, 0);
                deq.compute(q8, i, 0, sumi);
                deq.prepare(i, 1);
                deq.compute(q8, i, 1, sumi);
            } else {
                if constexpr (Dequantizer::num_blocks() == 8) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else if constexpr (Dequantizer::num_blocks() == 16) {
                    auto scales = deq.new_block(i, q8, acc);
                    deq.prepare(i, 0);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                    deq.prepare(i, 1);
#pragma GCC unroll 8
                    for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
                }
                else {
                    GGML_ASSERT(false);
                }
            }

#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }

#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}
template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQ(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;

    Q8<nrc_y, block_q8_K> q8(info);

    Dequantizer deq(vx, bx, nrc_y);

    for (int ix = 0; ix < nrc_x; ++ix) {

        deq.new_row(ix);

        float32x4_t acc[nrc_y];
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);

        for (int i = 0; i < nb; ++i) {

            int32x4_t sumi[nrc_y];
            for (int iy = 0; iy < nrc_y; ++iy) sumi[iy] = vdupq_n_s32(0);

            if constexpr (Dequantizer::num_blocks() == 8) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_8_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else if constexpr (Dequantizer::num_blocks() == 16) {
                auto scales = deq.new_block(i);
                deq.prepare(i, 0);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 0, sumi[iy]);
                deq.prepare(i, 1);
#pragma GCC unroll 8
                for (int iy = 0; iy < nrc_y; ++iy) compute_16_blocks(deq.bits.b1, deq.bits.b2, q8, scales, iy, i, 1, sumi[iy]);
            }
            else {
                GGML_ASSERT(false);
            }

#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                acc[iy] = vmlaq_f32(acc[iy], vcvtq_f32_s32(sumi[iy]), vdupq_n_f32(deq.d*q8.scale(iy, i)));
            }
        }

#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x2_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);
    const int8x16_t * qs_1 = (const int8x16_t *)qx_1.val;
    const int8x16_t * qs_2 = (const int8x16_t *)qx_2.val;

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[0], q8b_1.val[0]), qs_1[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_1[2], q8b_2.val[0]), qs_1[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[0], q8b_3.val[0]), qs_2[1], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qs_2[2], q8b_4.val[0]), qs_2[3], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales.val[j], pall);
}
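// Layout note: ggml_vdotq_s32 (sdot) accumulates 4-byte dot products per
// 32-bit lane, so each chained pair of vdots leaves one 32-quant block spread
// across the four lanes of p1..p4; the vpaddq_s32 cascade then reduces that to
// one lane per block, and vmlaq_s32 applies the four block scales in one go.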
template <typename Q8>
IQK_ALWAYS_INLINE void compute_8_blocks(const int8x16_t * qx, const Q8& q8,
        const int32x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {
    auto mzero = vdupq_n_s32(0);

    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[0], q8b_1.val[0]), qx[1], q8b_1.val[1]); // block 1
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[2], q8b_2.val[0]), qx[3], q8b_2.val[1]); // block 2
    auto p12 = vpaddq_s32(p1, p2);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[4], q8b_3.val[0]), qx[5], q8b_3.val[1]); // block 3
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = ggml_vdotq_s32(ggml_vdotq_s32(mzero, qx[6], q8b_4.val[0]), qx[7], q8b_4.val[1]); // block 4
    auto p34 = vpaddq_s32(p3, p4);

    auto pall = vpaddq_s32(p12, p34);
    sumi = vmlaq_s32(sumi, scales, pall);
}
template <typename Q8>
IQK_ALWAYS_INLINE void compute_16_blocks(const uint8x16x4_t& qx_1, const uint8x16x4_t& qx_2, const Q8& q8,
        const int32x4x4_t& scales, int iy, int i, int j, int32x4_t& sumi) {

    auto mzero = vdupq_n_s32(0);
    auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
    auto p1 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[0]), q8b_1.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[1]), q8b_1.val[1])); // blocks 0, 0, 1, 1,
    auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
    auto p2 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[2]), q8b_2.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_1.val[3]), q8b_2.val[1])); // blocks 2, 2, 3, 3,
    auto p12 = vpaddq_s32(p1, p2); // blocks 0, 1, 2, 3
    sumi = vmlaq_s32(sumi, scales.val[2*j+0], p12);

    auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
    auto p3 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[0]), q8b_3.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[1]), q8b_3.val[1])); // blocks 4, 4, 5, 5,
    auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
    auto p4 = vpaddq_s32(ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[2]), q8b_4.val[0]),
                         ggml_vdotq_s32(mzero, vreinterpretq_s8_u8(qx_2.val[3]), q8b_4.val[1])); // blocks 6, 6, 7, 7,
    auto p34 = vpaddq_s32(p3, p4); // blocks 4, 5, 6, 7
    sumi = vmlaq_s32(sumi, scales.val[2*j+1], p34);
}
template <typename Q8>
inline void accum_mins_8(const int16x8_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums8(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16(mins), vget_low_s16(q8s));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins), vget_high_s16(q8s));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(b1, b2));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}
template <typename Q8>
inline void accum_mins_16(const int16x8x2_t& mins, const Q8& q8, float32x4_t * acc, int i, float c) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto q8s = q8.load_bsums(iy, i);
        int32x4_t b1 = vmull_s16(vget_low_s16 (mins.val[0]), vget_low_s16 (q8s.val[0]));
        int32x4_t b2 = vmull_s16(vget_high_s16(mins.val[0]), vget_high_s16(q8s.val[0]));
        int32x4_t b3 = vmull_s16(vget_low_s16 (mins.val[1]), vget_low_s16 (q8s.val[1]));
        int32x4_t b4 = vmull_s16(vget_high_s16(mins.val[1]), vget_high_s16(q8s.val[1]));
        float32x4_t prod = vcvtq_f32_s32(vaddq_s32(vaddq_s32(b1, b2), vaddq_s32(b3, b4)));
        acc[iy] = vmlaq_f32(acc[iy], prod, vdupq_n_f32(c*q8.scale(iy, i)));
    }
}
struct Scales8 {
    uint32_t utmp[4];
    const uint8_t * sc8 = (const uint8_t *)utmp;
    template <typename Q8, typename Qx>
    inline int32x4x2_t process_scales_mins(const Qx& x, const Q8& q8, int i, float32x4_t * acc) {
        make_q4_scales(x.scales, utmp);
        int16x8_t mins = vmovl_s8(vld1_s8((const int8_t *)sc8 + 8));
        accum_mins_8(mins, q8, acc, i, -GGML_FP16_TO_FP32(x.dmin));

        uint8x8_t scales8 = vld1_u8(sc8);
        uint16x8_t scales16 = vmovl_u8(scales8);
        int32x4x2_t scales = {vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(scales16))),
                              vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(scales16)))};
        return scales;
    }
};
struct Q4bits {
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    uint8x16x4_t b1, b2;
    inline void prepare4(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[2] = vshrq_n_u8(val[0], 4);
        b.val[1] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare4_16(uint8x16x4_t& b, const uint8x16_t * val) const {
        b.val[0] = vandq_u8(val[0], m4b);
        b.val[1] = vshrq_n_u8(val[0], 4);
        b.val[2] = vandq_u8(val[1], m4b);
        b.val[3] = vshrq_n_u8(val[1], 4);
    }
    inline void prepare(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs + 32);
        prepare4(b2, q4bits.val);
    }
    inline void prepare_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4(b1, q4bits.val + 0);
        prepare4(b2, q4bits.val + 2);
    }
    inline void prepare64(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        b1.val[0] = vandq_u8(q4bits.val[0], m4b);
        b1.val[1] = vandq_u8(q4bits.val[1], m4b);
        b1.val[2] = vandq_u8(q4bits.val[2], m4b);
        b1.val[3] = vandq_u8(q4bits.val[3], m4b);
        b2.val[0] = vshrq_n_u8(q4bits.val[0], 4);
        b2.val[1] = vshrq_n_u8(q4bits.val[1], 4);
        b2.val[2] = vshrq_n_u8(q4bits.val[2], 4);
        b2.val[3] = vshrq_n_u8(q4bits.val[3], 4);
    }
    inline void prepare16(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x2(qs);
        prepare4_16(b1, q4bits.val);
        q4bits = vld1q_u8_x2(qs + 32);
        prepare4_16(b2, q4bits.val);
    }
    inline void prepare16_v2(const uint8_t * qs) {
        auto q4bits = vld1q_u8_x4(qs);
        prepare4_16(b1, q4bits.val + 0);
        prepare4_16(b2, q4bits.val + 2);
    }
};
struct Q2bits {
    const uint8x16_t m4b = vdupq_n_u8(0x03);
    uint8x16x4_t b1, b2;
    inline void prepare(const uint8_t * qs) {
        auto q2bits = vld1q_u8_x2(qs);
        b1.val[0] = vandq_u8(q2bits.val[0], m4b);
        b1.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b1.val[2] = vandq_u8(q2bits.val[0], m4b);
        b1.val[3] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[0] = vandq_u8(q2bits.val[0], m4b);
        b2.val[1] = vandq_u8(q2bits.val[1], m4b);

        q2bits.val[0] = vshrq_n_u8(q2bits.val[0], 2);
        q2bits.val[1] = vshrq_n_u8(q2bits.val[1], 2);
        b2.val[2] = vandq_u8(q2bits.val[0], m4b);
        b2.val[3] = vandq_u8(q2bits.val[1], m4b);
    }
};
template <typename block_q>
struct BaseDequantizer {
    BaseDequantizer(const void * vx, size_t bx, int nrc) : vx(vx), x(nullptr), bx(bx), nrc(nrc) {}
    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + ix*bx); }

    const void * vx;
    const block_q * x;
    const size_t bx;
    const int nrc;
};
struct DequantizerQ4K final : public BaseDequantizer<block_q4_K> {
    DequantizerQ4K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}

    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        if (nrc == 1) bits.prepare_v2(x[i].qs + 64*j);
        else bits.prepare(x[i].qs + 64*j);
    }

    Q4bits bits;
    Scales8 s8;

    float d;
};

struct HighBit5 {
    const uint8x16_t mhb = vdupq_n_u8(0x10);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 4), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 4), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 3), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 3), mhb));
        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));
        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};
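
// How HighBit5::apply lines up bits: for q5_K, bit k (k = 0..7) of a high-bits
// byte supplies the 5th bit of the k-th group of quants, and that bit must land
// at 0x10 on top of the 4-bit value. Shifting left by 4-k and masking with 0x10
// does this for k = 0..3; the do_shift step then moves bits 4..7 down for the
// second half. A scalar sketch (hypothetical helper, for illustration only):
static inline uint8_t q5_high_bit_scalar_ref(uint8_t hbits, int k /* 0..3 */) {
    return (uint8_t)((hbits << (4 - k)) & 0x10);  // OR-ed onto the low 4-bit quant
}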

struct HighBit3 {
    const uint8x16_t mhb = vdupq_n_u8(0x04);
    uint8x16x2_t bits;
    inline void apply(uint8x16x4_t& b1, uint8x16x4_t& b2, bool do_shift) {
        b1.val[0] = vorrq_u8(b1.val[0], vandq_u8(vshlq_n_u8(bits.val[0], 2), mhb));
        b1.val[1] = vorrq_u8(b1.val[1], vandq_u8(vshlq_n_u8(bits.val[1], 2), mhb));
        b1.val[2] = vorrq_u8(b1.val[2], vandq_u8(vshlq_n_u8(bits.val[0], 1), mhb));
        b1.val[3] = vorrq_u8(b1.val[3], vandq_u8(vshlq_n_u8(bits.val[1], 1), mhb));
        b2.val[0] = vorrq_u8(b2.val[0], vandq_u8(bits.val[0], mhb));
        b2.val[1] = vorrq_u8(b2.val[1], vandq_u8(bits.val[1], mhb));
        b2.val[2] = vorrq_u8(b2.val[2], vandq_u8(vshrq_n_u8(bits.val[0], 1), mhb));
        b2.val[3] = vorrq_u8(b2.val[3], vandq_u8(vshrq_n_u8(bits.val[1], 1), mhb));
        if (do_shift) {
            bits.val[0] = vshrq_n_u8(bits.val[0], 4);
            bits.val[1] = vshrq_n_u8(bits.val[1], 4);
        }
    }
};

struct DequantizerQ5K final : public BaseDequantizer<block_q5_K> {
    DequantizerQ5K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }
    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].qh);
        return s8.process_scales_mins(x[i], q8, i, acc);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+64*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }
    Q4bits bits;
    HighBit5 h;
    Scales8 s8;
    uint8x16x2_t hbits;
    float d;
};

inline int32x4x4_t make_wider(const int16x8x2_t& scales16) {
    int32x4x4_t scales = {
        vmovl_s16(vget_low_s16 (scales16.val[0])),
        vmovl_s16(vget_high_s16(scales16.val[0])),
        vmovl_s16(vget_low_s16 (scales16.val[1])),
        vmovl_s16(vget_high_s16(scales16.val[1])),
    };
    return scales;
}

template <typename Q8>
inline int32x4x4_t process_scales_mins_16(const int8x16_t& scales8, const Q8& q8, float32x4_t * acc, int i, float c) {
    int16x8x2_t scales16;
    scales16.val[0] = vmovl_s8(vget_low_s8 (scales8));
    scales16.val[1] = vmovl_s8(vget_high_s8(scales8));
    accum_mins_16(scales16, q8, acc, i, c);
    return make_wider(scales16);
}

struct DequantizerQ6K final : public BaseDequantizer<block_q6_K> {
    DequantizerQ6K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }
    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        return process_scales_mins_16(vld1q_s8(x[i].scales), q8, acc, i, -32.f*d);
    }
    inline void prepare(int i, int j) {
        auto hbits = vld1q_u8_x2(x[i].qh + 32*j);
        bits.prepare64(x[i].ql + 64*j);
        bits.b1.val[0] = vorrq_u8(bits.b1.val[0], vandq_u8(vshlq_n_u8(hbits.val[0], 4), mhb));
        bits.b1.val[1] = vorrq_u8(bits.b1.val[1], vandq_u8(vshlq_n_u8(hbits.val[1], 4), mhb));
        bits.b1.val[2] = vorrq_u8(bits.b1.val[2], vandq_u8(vshlq_n_u8(hbits.val[0], 2), mhb));
        bits.b1.val[3] = vorrq_u8(bits.b1.val[3], vandq_u8(vshlq_n_u8(hbits.val[1], 2), mhb));
        bits.b2.val[0] = vorrq_u8(bits.b2.val[0], vandq_u8(hbits.val[0], mhb));
        bits.b2.val[1] = vorrq_u8(bits.b2.val[1], vandq_u8(hbits.val[1], mhb));
        bits.b2.val[2] = vorrq_u8(bits.b2.val[2], vandq_u8(vshrq_n_u8(hbits.val[0], 2), mhb));
        bits.b2.val[3] = vorrq_u8(bits.b2.val[3], vandq_u8(vshrq_n_u8(hbits.val[1], 2), mhb));
    }
    Q4bits bits;
    const uint8x16_t mhb = vdupq_n_u8(0x30);
    float d;
};

struct DequantizerQ3K final : public BaseDequantizer<block_q3_K> {
    DequantizerQ3K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }
    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        h.bits = vld1q_u8_x2(x[i].hmask);
        const uint16_t * sc16 = (const uint16_t *)x[i].scales;
        uint32_t aux0 = sc16[0] | (sc16[1] << 16);
        uint32_t aux1 = sc16[2] | (sc16[3] << 16);
        uint32_t aux2 = sc16[4] | (sc16[5] << 16);
        aux32[0] =  (aux0       & 0x0f0f0f0f) | ((aux2 << 4) & 0x30303030);
        aux32[1] =  (aux1       & 0x0f0f0f0f) | ((aux2 << 2) & 0x30303030);
        aux32[2] = ((aux0 >> 4) & 0x0f0f0f0f) | ((aux2 >> 0) & 0x30303030);
        aux32[3] = ((aux1 >> 4) & 0x0f0f0f0f) | ((aux2 >> 2) & 0x30303030);
        return process_scales_mins_16(vaddq_s8(vld1q_s8((const int8_t *)aux32), vdupq_n_s8(-32)), q8, acc, i, -4.f*d);
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
        h.apply(bits.b1, bits.b2, j == 0);
    }
    uint32_t aux32[4];
    Q2bits bits;
    HighBit3 h;
    float d;
};
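
// The aux32[] gymnastics in DequantizerQ3K::new_block undo the q3_K scale
// packing: sixteen 6-bit scales live in 12 bytes, with the low 4 bits of
// scales 0-7 in the low nibbles of bytes 0-7, the low 4 bits of scales 8-15
// in the high nibbles of those same bytes, and the top 2 bits of all sixteen
// packed into bytes 8-11. A scalar reference for a single scale (hypothetical
// helper, consistent with the masks used above):
static inline int q3k_scale_scalar_ref(const uint8_t * sc /* 12 bytes */, int j /* 0..15 */) {
    int lo = j < 8 ? sc[j] & 0xf : sc[j-8] >> 4;
    int hi = (sc[8 + (j % 4)] >> (2 * (j / 4))) & 3;
    return (lo | (hi << 4)) - 32;  // matches the vaddq_s8(..., vdupq_n_s8(-32)) above
}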

struct DequantizerQ2K final : public BaseDequantizer<block_q2_K> {
    DequantizerQ2K(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return true; }
    template <typename Q8>
    inline void process_scales(int i, const Q8& q8, float32x4_t * acc) {
        d = GGML_FP16_TO_FP32(x[i].d);
        auto scales_and_mins = vld1q_u8(x[i].scales);
        auto mins8 = vreinterpretq_s8_u8(vshrq_n_u8(scales_and_mins, 4));
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8 (mins8));
        scales16.val[1] = vmovl_s8(vget_high_s8(mins8));
        accum_mins_16(scales16, q8, acc, i, -GGML_FP16_TO_FP32(x[i].dmin));
        scales8 = vandq_u8(scales_and_mins, vdupq_n_u8(0xf));
    }
    template <typename Q8>
    inline int32x4x4_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        process_scales(i, q8, acc);
        int16x8x2_t scales16;
        scales16.val[0] = vmovl_s8(vget_low_s8 (vreinterpretq_s8_u8(scales8)));
        scales16.val[1] = vmovl_s8(vget_high_s8(vreinterpretq_s8_u8(scales8)));
        return make_wider(scales16);
    }
    template <typename Q8>
    inline void compute(const Q8& q8, int i, int j, int32x4_t * sumi) {
        auto m1 = vdupq_n_u8(1);
        auto shuffle = vdupq_n_u8(8*j);
        bits.b1.val[0] = vmulq_u8(bits.b1.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[1] = vmulq_u8(bits.b1.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[2] = vmulq_u8(bits.b1.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b1.val[3] = vmulq_u8(bits.b1.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[0] = vmulq_u8(bits.b2.val[0], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[1] = vmulq_u8(bits.b2.val[1], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[2] = vmulq_u8(bits.b2.val[2], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        bits.b2.val[3] = vmulq_u8(bits.b2.val[3], vqtbl1q_u8(scales8, shuffle)); shuffle = vaddq_u8(shuffle, m1);
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            auto q8b_1 = q8.load_quants(iy, i, 4*j+0);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[0]), q8b_1.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[1]), q8b_1.val[1]);
            auto q8b_2 = q8.load_quants(iy, i, 4*j+1);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b1.val[2]), q8b_2.val[0]),
                    vreinterpretq_s8_u8(bits.b1.val[3]), q8b_2.val[1]);
            auto q8b_3 = q8.load_quants(iy, i, 4*j+2);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[0]), q8b_3.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[1]), q8b_3.val[1]);
            auto q8b_4 = q8.load_quants(iy, i, 4*j+3);
            sumi[iy] = ggml_vdotq_s32(ggml_vdotq_s32(sumi[iy], vreinterpretq_s8_u8(bits.b2.val[2]), q8b_4.val[0]),
                    vreinterpretq_s8_u8(bits.b2.val[3]), q8b_4.val[1]);
        }
    }
    inline void prepare(int i, int j) {
        bits.prepare(x[i].qs+32*j);
    }
    uint32_t aux32[4];
    uint8x16_t scales8;
    Q2bits bits;
    float d;
};

// ============================= i-quants

struct DequantizerIQ4XS final : public BaseDequantizer<block_iq4_xs> {
    static int8x16_t load_values() {
        static const int8_t iq4nl_values[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
        return vld1q_s8(iq4nl_values);
    }
    DequantizerIQ4XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc), values(load_values()) {}
    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }
    inline void new_row(int ix) { x = (const block_iq4_xs *)((const char *)vx + bx*ix); }
    template <typename Q8>
    inline int32x4x2_t new_block(int i, const Q8& q8, float32x4_t * acc) {
        (void)q8;
        (void)acc;
        d = GGML_FP16_TO_FP32(x[i].d);
        const uint16_t scales_h = x[i].scales_h;
        const uint16_t * scales_l = (const uint16_t *)x[i].scales_l;
        aux32[0] = scales_l[0] | (scales_l[1] << 16);
        aux32[1] = aux32[0] >> 4;
        // scl is ordered as 0, 2, 4, 6, 1, 3, 5, 7
        uint8x8_t scl8 = vand_u8(vld1_u8((const uint8_t *)aux32), vdup_n_u8(0xf));
        uint16_t * aux16 = (uint16_t *)aux32;
        aux16[0] = scales_h << 4; aux16[1] = scales_h << 2; aux16[2] = scales_h; aux16[3] = scales_h >> 2;
        // sch is ordered as 0, 4, 1, 5, 2, 6, 3, 7
        uint8x8_t sch8 = vand_u8(vld1_u8((const uint8_t *)aux16), vdup_n_u8(0x30));
        int8x8_t scales8 = vadd_s8(vreinterpret_s8_u8(vorr_u8(scl8, vtbl1_u8(sch8, vreinterpret_u8_u32(hshuff)))), vdup_n_s8(-32));
        // shuffle 0, 2, 4, 6, 1, 3, 5, 7 -> 0, 1, 2, 3, 4, 5, 6, 7
        scales8 = vtbl1_s8(scales8, vreinterpret_s8_u32(hshuff));
        int16x8_t scales16 = vmovl_s8(scales8);
        int32x4x2_t scales = {vmovl_s16(vget_low_s16(scales16)), vmovl_s16(vget_high_s16(scales16))};
        return scales;
    }
    inline void prepare(int i, int j) {
        bits.prepare16(x[i].qs+64*j);
        for (int k = 0; k < 4; ++k) {
            bits.b1.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b1.val[k]));
            bits.b2.val[k] = vreinterpretq_u8_s8(vqtbl1q_s8(values, bits.b2.val[k]));
        }
    }
    Q4bits bits;
    const int8x16_t values;
    uint32_t aux32[2];
    constexpr static uint32x2_t hshuff = {0x05010400, 0x07030602};
    float d;
};

struct SimpleBits {
    uint8x16x4_t b1;
    uint8x16x4_t b2;
};

IQK_ALWAYS_INLINE int32x4x2_t prepare_scales_8(const uint32x4_t& v1, const uint32x4_t& v2) {
    int32x4x2_t scales;
    auto one = vdupq_n_u32(1);
    scales.val[0] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v1, 28), 1));
    scales.val[1] = vreinterpretq_s32_u32(vsliq_n_u32(one, vshrq_n_u32(v2, 28), 1));
    return scales;
}

inline void apply_signs_2(uint8x16_t * b, const uint64_t * signs, uint32_t sidx) {
    auto s1 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 0) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 7) & 127))));
    auto s2 = vcombine_s8(vld1_s8((const int8_t *)(signs + ((sidx >> 14) & 127))), vld1_s8((const int8_t *)(signs + ((sidx >> 21) & 127))));
    b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s1));
    b[1] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[1]), s2));
}

IQK_ALWAYS_INLINE int32x4_t prepare_scales_8(const uint32x4_t& v1) {
    return vreinterpretq_s32_u32(vsliq_n_u32(vdupq_n_u32(1), vshrq_n_u32(v1, 28), 1));
}
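
// Both prepare_scales_8 overloads decode the block scale that iq2_xxs/iq3_xxs
// store in the top 4 bits of each 32-bit sign group: vsliq_n_u32(one, v >> 28, 1)
// is a shift-left-insert over a register of ones, i.e. (s << 1) | 1, giving the
// odd scale 2*s + 1. Scalar equivalent (hypothetical helper, illustration only):
static inline int32_t iq_block_scale_scalar_ref(uint32_t v) {
    return (int32_t)(((v >> 28) << 1) | 1);  // 2*(v >> 28) + 1
}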

struct DequantizerIQ2XXS final : public BaseDequantizer<block_iq2_xxs> {
    DequantizerIQ2XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.125f * GGML_FP16_TO_FP32(x[i].d); }
    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto data = vld1q_u32_x2((const uint32_t *)(x[i].qs + 16*j));
        prepare_all(data, q);
        return prepare_scales_8(vuzp2q_u32(data.val[0], data.val[1]));
    }
private:
    static inline void prepare2(uint8x16_t * b, const uint32_t * bits, const uint64_t * signs) {
        const uint8_t * idx = (const uint8_t *)bits;
        b[0] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[0]], iq2xxs_grid[idx[1]]});
        b[1] = vreinterpretq_u8_u64(uint64x2_t{iq2xxs_grid[idx[2]], iq2xxs_grid[idx[3]]});
        apply_signs_2(b, signs, bits[1]);
    }
    inline static void prepare_all(const uint32x4x2_t& data, uint8x16_t * quants) {
        const uint32_t * q2 = (const uint32_t *)data.val;
        prepare2(quants + 0, q2 + 0, keven_signs);
        prepare2(quants + 2, q2 + 2, keven_signs);
        prepare2(quants + 4, q2 + 4, keven_signs);
        prepare2(quants + 6, q2 + 6, keven_signs);
    }
};

inline int32x4x4_t prepare_4bit_scales16(const uint8_t * sc) {
    auto aux = vld1_u8(sc);
    auto scales_l = vand_u8(aux, vdup_n_u8(0xf));
    auto scales_h = vshr_n_u8(aux, 4);
    auto aux1 = vcombine_u8(vzip1_u8(scales_l, scales_h), vzip2_u8(scales_l, scales_h));
    auto scales8 = vreinterpretq_s8_u8(vorrq_u8(vshlq_n_u8(aux1, 1), vdupq_n_u8(1)));
    int16x8x2_t scales16 = { vmovl_s8(vget_low_s8(scales8)), vmovl_s8(vget_high_s8(scales8)) };
    return make_wider(scales16);
}

struct DequantizerIQ2XS final : public BaseDequantizer<block_iq2_xs> {
    DequantizerIQ2XS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0);
        return prepare_4bit_scales16(x[i].scales);
    }
    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1);
    }
private:
    static void make2(const uint16_t * qs, uint8x16_t * b) {
        auto v1 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[0] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[1] & 511))));
        auto v2 = vcombine_s8(vld1_s8((const int8_t *)(iq2xs_grid + (qs[2] & 511))), vld1_s8((const int8_t *)(iq2xs_grid + (qs[3] & 511))));
        auto s1 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[0] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[1] >> 9))));
        auto s2 = vcombine_s8(vld1_s8((const int8_t *)(keven_signs + (qs[2] >> 9))), vld1_s8((const int8_t *)(keven_signs + (qs[3] >> 9))));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(v1, s1));
        b[1] = vreinterpretq_u8_s8(vmulq_s8(v2, s2));
    }
    inline static void make4(const uint16_t * qs, uint8x16_t * b) {
        make2(qs + 0, b + 0);
        make2(qs + 4, b + 2);
    }
    IQK_ALWAYS_INLINE void prepare_internal(int i, int j) {
        make4(x[i].qs + 16*j + 0, bits.b1.val);
        make4(x[i].qs + 16*j + 8, bits.b2.val);
    }
};

// So, I hate to include this table, but with the GCC 12.3 compiler
// bundled in the Cosmopolitan tools, loading the unpacked sign bytes
// from this table using the packed 8 sign bits as index is faster than
// using the standard trick of vceqq_u8(vandq_u8(bits, mask), mask) to
// expand the bits to bytes.
static const uint64_t kall_signs[256] = {
    0x0101010101010101, 0x01010101010101ff, 0x010101010101ff01, 0x010101010101ffff,
    0x0101010101ff0101, 0x0101010101ff01ff, 0x0101010101ffff01, 0x0101010101ffffff,
    0x01010101ff010101, 0x01010101ff0101ff, 0x01010101ff01ff01, 0x01010101ff01ffff,
    0x01010101ffff0101, 0x01010101ffff01ff, 0x01010101ffffff01, 0x01010101ffffffff,
    0x010101ff01010101, 0x010101ff010101ff, 0x010101ff0101ff01, 0x010101ff0101ffff,
    0x010101ff01ff0101, 0x010101ff01ff01ff, 0x010101ff01ffff01, 0x010101ff01ffffff,
    0x010101ffff010101, 0x010101ffff0101ff, 0x010101ffff01ff01, 0x010101ffff01ffff,
    0x010101ffffff0101, 0x010101ffffff01ff, 0x010101ffffffff01, 0x010101ffffffffff,
    0x0101ff0101010101, 0x0101ff01010101ff, 0x0101ff010101ff01, 0x0101ff010101ffff,
    0x0101ff0101ff0101, 0x0101ff0101ff01ff, 0x0101ff0101ffff01, 0x0101ff0101ffffff,
    0x0101ff01ff010101, 0x0101ff01ff0101ff, 0x0101ff01ff01ff01, 0x0101ff01ff01ffff,
    0x0101ff01ffff0101, 0x0101ff01ffff01ff, 0x0101ff01ffffff01, 0x0101ff01ffffffff,
    0x0101ffff01010101, 0x0101ffff010101ff, 0x0101ffff0101ff01, 0x0101ffff0101ffff,
    0x0101ffff01ff0101, 0x0101ffff01ff01ff, 0x0101ffff01ffff01, 0x0101ffff01ffffff,
    0x0101ffffff010101, 0x0101ffffff0101ff, 0x0101ffffff01ff01, 0x0101ffffff01ffff,
    0x0101ffffffff0101, 0x0101ffffffff01ff, 0x0101ffffffffff01, 0x0101ffffffffffff,
    0x01ff010101010101, 0x01ff0101010101ff, 0x01ff01010101ff01, 0x01ff01010101ffff,
    0x01ff010101ff0101, 0x01ff010101ff01ff, 0x01ff010101ffff01, 0x01ff010101ffffff,
    0x01ff0101ff010101, 0x01ff0101ff0101ff, 0x01ff0101ff01ff01, 0x01ff0101ff01ffff,
    0x01ff0101ffff0101, 0x01ff0101ffff01ff, 0x01ff0101ffffff01, 0x01ff0101ffffffff,
    0x01ff01ff01010101, 0x01ff01ff010101ff, 0x01ff01ff0101ff01, 0x01ff01ff0101ffff,
    0x01ff01ff01ff0101, 0x01ff01ff01ff01ff, 0x01ff01ff01ffff01, 0x01ff01ff01ffffff,
    0x01ff01ffff010101, 0x01ff01ffff0101ff, 0x01ff01ffff01ff01, 0x01ff01ffff01ffff,
    0x01ff01ffffff0101, 0x01ff01ffffff01ff, 0x01ff01ffffffff01, 0x01ff01ffffffffff,
    0x01ffff0101010101, 0x01ffff01010101ff, 0x01ffff010101ff01, 0x01ffff010101ffff,
    0x01ffff0101ff0101, 0x01ffff0101ff01ff, 0x01ffff0101ffff01, 0x01ffff0101ffffff,
    0x01ffff01ff010101, 0x01ffff01ff0101ff, 0x01ffff01ff01ff01, 0x01ffff01ff01ffff,
    0x01ffff01ffff0101, 0x01ffff01ffff01ff, 0x01ffff01ffffff01, 0x01ffff01ffffffff,
    0x01ffffff01010101, 0x01ffffff010101ff, 0x01ffffff0101ff01, 0x01ffffff0101ffff,
    0x01ffffff01ff0101, 0x01ffffff01ff01ff, 0x01ffffff01ffff01, 0x01ffffff01ffffff,
    0x01ffffffff010101, 0x01ffffffff0101ff, 0x01ffffffff01ff01, 0x01ffffffff01ffff,
    0x01ffffffffff0101, 0x01ffffffffff01ff, 0x01ffffffffffff01, 0x01ffffffffffffff,
    0xff01010101010101, 0xff010101010101ff, 0xff0101010101ff01, 0xff0101010101ffff,
    0xff01010101ff0101, 0xff01010101ff01ff, 0xff01010101ffff01, 0xff01010101ffffff,
    0xff010101ff010101, 0xff010101ff0101ff, 0xff010101ff01ff01, 0xff010101ff01ffff,
    0xff010101ffff0101, 0xff010101ffff01ff, 0xff010101ffffff01, 0xff010101ffffffff,
    0xff0101ff01010101, 0xff0101ff010101ff, 0xff0101ff0101ff01, 0xff0101ff0101ffff,
    0xff0101ff01ff0101, 0xff0101ff01ff01ff, 0xff0101ff01ffff01, 0xff0101ff01ffffff,
    0xff0101ffff010101, 0xff0101ffff0101ff, 0xff0101ffff01ff01, 0xff0101ffff01ffff,
    0xff0101ffffff0101, 0xff0101ffffff01ff, 0xff0101ffffffff01, 0xff0101ffffffffff,
    0xff01ff0101010101, 0xff01ff01010101ff, 0xff01ff010101ff01, 0xff01ff010101ffff,
    0xff01ff0101ff0101, 0xff01ff0101ff01ff, 0xff01ff0101ffff01, 0xff01ff0101ffffff,
    0xff01ff01ff010101, 0xff01ff01ff0101ff, 0xff01ff01ff01ff01, 0xff01ff01ff01ffff,
    0xff01ff01ffff0101, 0xff01ff01ffff01ff, 0xff01ff01ffffff01, 0xff01ff01ffffffff,
    0xff01ffff01010101, 0xff01ffff010101ff, 0xff01ffff0101ff01, 0xff01ffff0101ffff,
    0xff01ffff01ff0101, 0xff01ffff01ff01ff, 0xff01ffff01ffff01, 0xff01ffff01ffffff,
    0xff01ffffff010101, 0xff01ffffff0101ff, 0xff01ffffff01ff01, 0xff01ffffff01ffff,
    0xff01ffffffff0101, 0xff01ffffffff01ff, 0xff01ffffffffff01, 0xff01ffffffffffff,
    0xffff010101010101, 0xffff0101010101ff, 0xffff01010101ff01, 0xffff01010101ffff,
    0xffff010101ff0101, 0xffff010101ff01ff, 0xffff010101ffff01, 0xffff010101ffffff,
    0xffff0101ff010101, 0xffff0101ff0101ff, 0xffff0101ff01ff01, 0xffff0101ff01ffff,
    0xffff0101ffff0101, 0xffff0101ffff01ff, 0xffff0101ffffff01, 0xffff0101ffffffff,
    0xffff01ff01010101, 0xffff01ff010101ff, 0xffff01ff0101ff01, 0xffff01ff0101ffff,
    0xffff01ff01ff0101, 0xffff01ff01ff01ff, 0xffff01ff01ffff01, 0xffff01ff01ffffff,
    0xffff01ffff010101, 0xffff01ffff0101ff, 0xffff01ffff01ff01, 0xffff01ffff01ffff,
    0xffff01ffffff0101, 0xffff01ffffff01ff, 0xffff01ffffffff01, 0xffff01ffffffffff,
    0xffffff0101010101, 0xffffff01010101ff, 0xffffff010101ff01, 0xffffff010101ffff,
    0xffffff0101ff0101, 0xffffff0101ff01ff, 0xffffff0101ffff01, 0xffffff0101ffffff,
    0xffffff01ff010101, 0xffffff01ff0101ff, 0xffffff01ff01ff01, 0xffffff01ff01ffff,
    0xffffff01ffff0101, 0xffffff01ffff01ff, 0xffffff01ffffff01, 0xffffff01ffffffff,
    0xffffffff01010101, 0xffffffff010101ff, 0xffffffff0101ff01, 0xffffffff0101ffff,
    0xffffffff01ff0101, 0xffffffff01ff01ff, 0xffffffff01ffff01, 0xffffffff01ffffff,
    0xffffffffff010101, 0xffffffffff0101ff, 0xffffffffff01ff01, 0xffffffffff01ffff,
    0xffffffffffff0101, 0xffffffffffff01ff, 0xffffffffffffff01, 0xffffffffffffffff,
};
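
// Each kall_signs entry expands one packed 8-bit sign mask into eight
// multiplier bytes: 0x01 for +1 and 0xff (-1 as int8) where the corresponding
// bit of the index is set, with bit k controlling byte k. A scalar generator
// that reproduces the table (hypothetical helper, shown only to document the
// layout):
static inline uint64_t kall_signs_scalar_ref(uint8_t sign_bits) {
    uint64_t v = 0;
    for (int k = 0; k < 8; ++k)
        v |= (uint64_t)(((sign_bits >> k) & 1) ? 0xff : 0x01) << (8*k);
    return v;
}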

struct SignHelper {
    IQK_ALWAYS_INLINE void apply_signs_1x(uint8x16_t * b, const uint8_t * sign_bits) const {
        auto s = vreinterpretq_s8_u64(uint64x2_t{kall_signs[sign_bits[0]], kall_signs[sign_bits[1]]});
        // Normally we would expect this to be faster, but it isn't.
        //auto aux = vcombine_u8(vdup_n_u8(sign_bits[0]), vdup_n_u8(sign_bits[1]));
        //auto s = vreinterpretq_s8_u8(vorrq_u8(vceqq_u8(vandq_u8(aux, smask), smask), m1));
        b[0] = vreinterpretq_u8_s8(vmulq_s8(vreinterpretq_s8_u8(b[0]), s));
    }
    // We would need these two if we weren't loading from the unpacked sign table.
    //const uint8x16_t smask = vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201));
    //const uint8x16_t m1 = vdupq_n_u8(1);
};

struct DequantizerIQ2S final : public BaseDequantizer<block_iq2_s> {
    DequantizerIQ2S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 16; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x4_t new_block(int i) {
        d = 0.125f * GGML_FP16_TO_FP32(x[i].d);
        prepare_internal(i, 0, bits);
        return prepare_4bit_scales16(x[i].scales);
    }
    inline void prepare(int i, int j) {
        if (j == 1) prepare_internal(i, 1, bits);
    }
private:
    static void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh, uint8x16_t * b) {
        uint32_t aux32[2];
        const uint16_t * aux16 = (const uint16_t *)aux32;
        for (int k = 0; k < 2; ++k) {
            aux32[1] = (qh[k] << 4) | (qh[k] << 18);
            aux32[0] = (aux32[1] << 4) & 0x03000300;
            aux32[1] &= 0x03000300;
            b[2*k+0] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+0] | aux16[0]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+1] | aux16[1]))));
            b[2*k+1] = vcombine_u8(vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+2] | aux16[2]))),
                                   vld1_u8((const uint8_t *)(iq2s_grid + (qs[4*k+3] | aux16[3]))));
            sh.apply_signs_1x(b+2*k+0, sign_bits); sign_bits += 2;
            sh.apply_signs_1x(b+2*k+1, sign_bits); sign_bits += 2;
        }
    }
    void prepare_internal(int i, int j, SimpleBits& sb) {
        const auto * qs = x[i].qs + 16*j;
        const auto * qh = x[i].qh + 4*j;
        const auto * sign_bits = qs + QK_K/8;
        make4(sh, sign_bits + 0, qs + 0, qh + 0, sb.b1.val);
        make4(sh, sign_bits + 8, qs + 8, qh + 2, sb.b2.val);
    }
    SignHelper sh;
};

struct DequantizerIQ3XXS final : public BaseDequantizer<block_iq3_xxs> {
    DequantizerIQ3XXS(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    IQK_ALWAYS_INLINE float new_block(int i) const { return 0.25f * GGML_FP16_TO_FP32(x[i].d); }
    inline int32x4_t unpack(int i, int j, uint8x16_t * q) const {
        auto q3data = vld1q_u8_x2(x[i].qs + 32*j);
        auto gas = vld1q_u32((const uint32_t *)(x[i].qs + QK_K/4 + 16*j));
        prepare_block((const uint8_t *)q3data.val, (const uint32_t *)&gas, q);
        return prepare_scales_8(gas);
    }
private:
    inline static void make2(const uint8_t * q3, const uint32_t sidx, uint8x16_t * b) {
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[0]], iq3xxs_grid[q3[1]], iq3xxs_grid[q3[2]], iq3xxs_grid[q3[3]]});
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3xxs_grid[q3[4]], iq3xxs_grid[q3[5]], iq3xxs_grid[q3[6]], iq3xxs_grid[q3[7]]});
        apply_signs_2(b, keven_signs, sidx);
    }
    inline static void prepare_block(const uint8_t * q3, const uint32_t * signs, uint8x16_t * quants) {
        make2(q3 +  0, signs[0], quants + 0);
        make2(q3 +  8, signs[1], quants + 2);
        make2(q3 + 16, signs[2], quants + 4);
        make2(q3 + 24, signs[3], quants + 6);
    }
};

struct DequantizerIQ3S final : public BaseDequantizer<block_iq3_s> {
    DequantizerIQ3S(const void * vx, size_t bx, int nrc) : BaseDequantizer(vx, bx, nrc) {}
    constexpr static int num_blocks() { return 8; }
    constexpr static bool should_scale_quants() { return false; }

    SimpleBits bits;
    float d;

    inline int32x4x2_t new_block(int i) {
        d = GGML_FP16_TO_FP32(x[i].d);
        uint32_t scales32[2];
        auto qs = vld1q_u8_x2(x[i].qs);
        auto signs = vld1q_u8(x[i].signs);
        prepare_block((const uint8_t *)qs.val, x[i].qh, (const uint8_t *)&signs);
        std::memcpy(scales32, x[i].scales, 4);
        scales32[1] = (((scales32[0] >> 4) & 0x0f0f0f0f) << 1) | 0x01010101;
        scales32[0] = ((scales32[0] & 0x0f0f0f0f) << 1) | 0x01010101;
        auto scales8 = vld1_u8((const uint8_t *)scales32);
        // 0, 2, 4, 6, 1, 3, 5, 7
        scales8 = vtbl1_u8(scales8, vreinterpret_u8_u64(vdup_n_u64(0x0703060205010400)));
        auto scales16 = vreinterpretq_s16_u16(vmovl_u8(scales8));
        int32x4x2_t scales;
        scales.val[0] = vmovl_s16(vget_low_s16(scales16));
        scales.val[1] = vmovl_s16(vget_high_s16(scales16));
        return scales;
    }
    inline void prepare(int i, int j) {
        if (j == 1) {
            auto qs = vld1q_u8_x2(x[i].qs + 32);
            auto signs = vld1q_u8(x[i].signs + 16);
            prepare_block((const uint8_t *)qs.val, x[i].qh + 4, (const uint8_t *)&signs);
        }
    }
private:
    static inline void make2(const SignHelper& sh, const uint8_t * sign_bits, const uint16x8_t& idx_l, uint8_t qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto vindex = vorrq_u16(idx_l, vandq_u16(vshlq_u16(vdupq_n_u16(qh), hshift), vdupq_n_u16(256)));
        const uint16_t * idx = (const uint16_t *)&vindex;
        b[0] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[0]], iq3s_grid[idx[1]], iq3s_grid[idx[2]], iq3s_grid[idx[3]]});
        sh.apply_signs_1x(b+0, sign_bits+0);
        b[1] = vreinterpretq_u8_u32(uint32x4_t{iq3s_grid[idx[4]], iq3s_grid[idx[5]], iq3s_grid[idx[6]], iq3s_grid[idx[7]]});
        sh.apply_signs_1x(b+1, sign_bits+2);
    }
    static inline void make4(const SignHelper& sh, const uint8_t * sign_bits, const uint8_t * qs, const uint8_t * qh,
            const int16x8_t& hshift, uint8x16_t * b) {
        auto idx_l = vld1q_u8(qs);
        make2(sh, sign_bits+0, vmovl_u8(vget_low_u8 (idx_l)), qh[0], hshift, b+0);
        make2(sh, sign_bits+4, vmovl_u8(vget_high_u8(idx_l)), qh[1], hshift, b+2);
    }
    static int16x8_t load_shift() {
        static const int16_t k_shift[8] = {8, 7, 6, 5, 4, 3, 2, 1};
        return vld1q_s16(k_shift);
    }
    inline void prepare_block(const uint8_t * qs, const uint8_t * qh, const uint8_t * sign_bits) {
        auto signs = vld1q_u8(sign_bits);
        auto s = (const uint8_t *)&signs;
        make4(sh, s+0, qs+ 0, qh+0, hshift, bits.b1.val);
        make4(sh, s+8, qs+16, qh+2, hshift, bits.b2.val);
    }
    SignHelper sh;
    const int16x8_t hshift = load_shift();
};

template <int nrc_y, typename Dequantizer>
IQK_NOINLINE void mul_mat_qX_K_q8_K_IQXXS(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    assert(n % QK_K == 0);
    const int nb = n / QK_K;
    Q8<nrc_y, block_q8_K> q8(info);
    Dequantizer deq(vx, bx, nrc_y);
    uint8x16_t qx[8];
    int32x4_t sumi[nrc_y];
    float32x4_t acc[nrc_y];
    for (int ix = 0; ix < nrc_x; ++ix) {
        deq.new_row(ix);
        for (int iy = 0; iy < nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);
        for (int i = 0; i < nb; ++i) {
            float d = deq.new_block(i);
            auto scales = deq.unpack(i, 0, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                sumi[iy] = vdupq_n_s32(0);
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 0, sumi[iy]);
            }
            scales = deq.unpack(i, 1, qx);
#pragma GCC unroll 8
            for (int iy = 0; iy < nrc_y; ++iy) {
                compute_8_blocks((const int8x16_t *)qx, q8, scales, iy, i, 1, sumi[iy]);
                acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*q8.scale(iy, i)), vcvtq_f32_s32(sumi[iy]));
            }
        }
#pragma GCC unroll 8
        for (int iy = 0; iy < nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

// =========================================== Legacy quants

template <typename Block>
inline float16x4_t load_scales_q0(const Block * x, ggml_half * aux) {
    for (int k = 0; k < 4; ++k) aux[k] = x[k].d;
    return vld1_f16((const float16_t *)aux);
}

template <typename Block>
inline float16x8_t load_scales_q1(const Block * x, ggml_half * aux) {
    if constexpr (std::is_same_v<Block, block_q8_1>) {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].s; }
    } else {
        for (int k = 0; k < 4; ++k) { aux[k] = x[k].d; aux[k+4] = x[k].m; }
    }
    return vld1q_f16((const float16_t *)aux);
}

struct Q4LegacyBits {
    template <typename Block>
    inline void prepare(const Block * x) {
        for (int i = 0; i < 4; ++i) {
            auto q4bits = vld1q_u8(x[i].qs);
            b[2*i+0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
            b[2*i+1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
        }
    }
    inline void prepare1(const uint8_t * qs, int8x16_t * q) const {
        auto q4bits = vld1q_u8(qs);
        q[0] = vreinterpretq_s8_u8(vandq_u8(q4bits, m4b));
        q[1] = vreinterpretq_s8_u8(vshrq_n_u8(q4bits, 4));
    }
    inline void prepare1(const uint8_t * qs) {
        prepare1(qs, b);
    }
    const uint8x16_t m4b = vdupq_n_u8(0xf);
    int8x16_t b[8];
};

// One would think this commented out version would do better than the one below
// because it offers more opportunities to execute instructions in parallel.
// Instead, it runs significantly slower. Why? If the compiler is running out of
// vector registers, can it not just do the sequential version below on its own?
//inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
//    const auto q8b_1 = vld1q_s8_x2(qs + 0);
//    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b_1.val[0]), b[1], q8b_1.val[1]);
//    const auto q8b_2 = vld1q_s8_x2(qs + 32);
//    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b_2.val[0]), b[3], q8b_2.val[1]);
//    auto p1234 = vpaddq_s32(p12, p34);
//    const auto q8b_3 = vld1q_s8_x2(qs + 64);
//    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b_3.val[0]), b[5], q8b_3.val[1]);
//    const auto q8b_4 = vld1q_s8_x2(qs + 96);
//    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b_4.val[0]), b[7], q8b_4.val[1]);
//    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
//}
inline int32x4_t sum_4_blocks(const int8x16_t * b, const int8_t * qs) {
    auto q8b = vld1q_s8_x2(qs + 0);
    auto p12 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[0], q8b.val[0]), b[1], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 32);
    auto p34 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[2], q8b.val[0]), b[3], q8b.val[1]);
    auto p1234 = vpaddq_s32(p12, p34);
    q8b = vld1q_s8_x2(qs + 64);
    auto p56 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[4], q8b.val[0]), b[5], q8b.val[1]);
    q8b = vld1q_s8_x2(qs + 96);
    auto p78 = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), b[6], q8b.val[0]), b[7], q8b.val[1]);
    return vpaddq_s32(p1234, vpaddq_s32(p56, p78));
}
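
// What the vpaddq_s32 tree in sum_4_blocks produces: after the two chained
// ggml_vdotq_s32 calls per 32-quant block, each block's dot product is spread
// over four lanes; the pairwise adds collapse them so lane k of the result is
// the complete dot product of block k. Scalar reference (hypothetical helper,
// with b viewed as 128 consecutive int8 values):
static inline void sum_4_blocks_scalar_ref(const int8_t * b, const int8_t * qs, int32_t out[4]) {
    for (int k = 0; k < 4; ++k) {
        int32_t s = 0;
        for (int i = 0; i < 32; ++i) s += (int32_t)b[32*k + i] * (int32_t)qs[32*k + i];
        out[k] = s;
    }
}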

template <int nrc> struct Q80 {
    constexpr static int nrc_y = nrc;
    Q80(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_0 *)info.src1_row(iy);
    }
    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return y4->qs;
    }
    inline float16x4_t load_scales(int iy, int i) const {
        const block_q8_0_x4 * y4 = (const block_q8_0_x4 *)y[iy] + i;
        return vld1_f16((const float16_t *)y4->d);
    }
    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * /*acc*/) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            sc16[iy] = vmul_f16(qx_scales, q8_scales);
        }
    }
    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
        }
    }
    const block_q8_0 * y[nrc_y];
};

template <int nrc> struct Q81 {
    constexpr static int nrc_y = nrc;
    Q81(const DataInfo& info) {
        for (int iy = 0; iy < nrc_y; ++iy) y[iy] = (const block_q8_1 *)info.src1_row(iy);
    }
    inline const int8_t * quant_data(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return y4->qs;
    }
    inline float16x8_t load_scales(int iy, int i) const {
        const block_q8_1_x4 * y4 = (const block_q8_1_x4 *)y[iy] + i;
        return vld1q_f16((const float16_t *)y4->d);
    }
    template <typename Dequantizer>
    inline void process_scales(int i, Dequantizer& deq, float16x4_t * sc16, float32x4_t * acc) const {
        auto qx_scales = deq.new_block(i);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8_scales = load_scales(iy, i);
            auto m = vmul_f16(vget_high_f16(qx_scales), vget_high_f16(q8_scales));
            acc[iy] = vaddq_f32(acc[iy], vcvt_f32_f16(m));
            sc16[iy] = vmul_f16(vget_low_f16(qx_scales), vget_low_f16(q8_scales));
        }
    }
    template <typename Dequantizer>
    inline void process_1_block(int i, Dequantizer& deq, float32x4_t * acc) const {
        deq.prepare1(i);
        float d = GGML_FP16_TO_FP32(deq.x[i].d), m = 0.25f*GGML_FP16_TO_FP32(deq.x[i].m);
        for (int iy = 0; iy < nrc; ++iy) {
            auto q8b = vld1q_s8_x2(y[iy][i].qs);
            auto p = ggml_vdotq_s32(ggml_vdotq_s32(vdupq_n_s32(0), deq.bits.b[0], q8b.val[0]), deq.bits.b[1], q8b.val[1]);
            acc[iy] = vmlaq_f32(acc[iy], vdupq_n_f32(d*GGML_FP16_TO_FP32(y[iy][i].d)), vcvtq_f32_s32(p));
            acc[iy] = vaddq_f32(acc[iy], vdupq_n_f32(m*GGML_FP16_TO_FP32(y[iy][i].s)));
        }
    }
    const block_q8_1 * y[nrc_y];
};

template <typename block_q>
struct BaseLegacyDequantizer {
    BaseLegacyDequantizer(const void * vx, size_t bx) : vx(vx), x(nullptr), bx(bx) {}
    inline void new_row(int ix) { x = (const block_q *)((const char *)vx + bx*ix); }
    Q4LegacyBits bits;
    const void * vx;
    const block_q * x;
    size_t bx;
};

struct DequantizerQ40 final : public BaseLegacyDequantizer<block_q4_0> {
    DequantizerQ40(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        q[0] = vaddq_s8(q[0], m8);
        q[1] = vaddq_s8(q[1], m8);
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }
    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }
    const int8x16_t m8 = vdupq_n_s8(-8);
    //ggml_half aux[4];
};

struct DequantizerQ41 : public BaseLegacyDequantizer<block_q4_1> {
    DequantizerQ41(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
    inline void prepare1(int i) {
        bits.prepare1(x[i].qs);
    }
    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q4_1)/4;
            bits.prepare1(x[4*i+k].qs, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }
    // Leaving this commented-out attempt here as a reminder that I already tried it.
    // It has basically the same performance as the version above.
    //inline float16x8_t new_block(int i) {
    //    uint32x4_t scales = {};
    //    const block_q4_1 * xi = x + 4*i;
    //    const uint32_t * s32 = (const uint32_t *)&xi->d;
    //    scales = vsetq_lane_u32(*s32, scales, 0); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[0].qs, bits.b + 0);
    //    scales = vsetq_lane_u32(*s32, scales, 1); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[1].qs, bits.b + 2);
    //    scales = vsetq_lane_u32(*s32, scales, 2); s32 += sizeof(block_q4_1)/4;
    //    bits.prepare1(xi[2].qs, bits.b + 4);
    //    scales = vsetq_lane_u32(*s32, scales, 3);
    //    bits.prepare1(xi[3].qs, bits.b + 6);
    //    return vreinterpretq_f16_u8(vqtbl1q_u8(vreinterpretq_u8_u32(scales), vreinterpretq_u8_u64(shuffle)));
    //}
    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};

struct HighBit5Legacy {
    inline uint8x16_t to_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vreinterpretq_u8_u64(mask));
    }
    inline uint8x16_t to_negated_bytes(const uint8_t * qh) const {
        uint8x16_t h = vqtbl1q_u8(vreinterpretq_u8_u16(vdupq_n_u16(*(const uint16_t *)qh)), shuffle);
        return vceqq_u8(vandq_u8(h, vreinterpretq_u8_u64(mask)), vdupq_n_u8(0));
    }
    const uint64x2_t mask = vdupq_n_u64(0x8040201008040201);
    const uint8x16_t shuffle = vcombine_u8(vdup_n_u8(0), vdup_n_u8(1));
};

struct DequantizerQ50 final : public BaseLegacyDequantizer<block_q5_0> {
    DequantizerQ50(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_negated_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_negated_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        prepare1(i, bits.b);
    }
    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vld1_f16((const float16_t *)aux);
    }
    HighBit5Legacy hbits;
    const uint8x16_t mh = vdupq_n_u8(0xf0);
};

struct DequantizerQ80 final : public BaseLegacyDequantizer<block_q8_0> {
    DequantizerQ80(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
    inline void prepare1(int i) {
        bits.b[0] = vld1q_s8(x[i].qs);
        bits.b[1] = vld1q_s8(x[i].qs+16);
    }
    inline float16x4_t new_block(int i) {
        ggml_half aux[4];
        for (int k = 0; k < 4; ++k) {
            aux[k] = x[4*i+k].d;
            bits.b[2*k+0] = vld1q_s8(x[4*i+k].qs);
            bits.b[2*k+1] = vld1q_s8(x[4*i+k].qs+16);
        }
        return vld1_f16((const float16_t *)aux);
    }
};

struct DequantizerQ51 final : public BaseLegacyDequantizer<block_q5_1> {
    DequantizerQ51(const void * vx, size_t bx) : BaseLegacyDequantizer(vx, bx) {}
    inline void prepare1(int i, int8x16_t * q) const {
        bits.prepare1(x[i].qs, q);
        auto qh = x[i].qh;
        q[0] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[0]), vandq_u8(mh, hbits.to_bytes(qh+0))));
        q[1] = vreinterpretq_s8_u8(vorrq_u8(vreinterpretq_u8_s8(q[1]), vandq_u8(mh, hbits.to_bytes(qh+2))));
    }
    inline void prepare1(int i) {
        bits.prepare1(x[i].qs, bits.b);
    }
    inline float16x8_t new_block(int i) {
        uint32_t aux32[4];
        const uint32_t * s32 = (const uint32_t *)&x[4*i].d;
        for (int k = 0; k < 4; ++k) {
            aux32[k] = *s32; s32 += sizeof(block_q5_1)/4;
            prepare1(4*i+k, bits.b + 2*k);
        }
        return vreinterpretq_f16_u8(vqtbl1q_u8(vld1q_u8((const uint8_t *)aux32), vreinterpretq_u8_u64(shuffle)));
    }
    HighBit5Legacy hbits;
    const uint8x16_t mh = vdupq_n_u8(0x10);
    const uint64x2_t shuffle = {0x0d0c090805040100, 0x0f0e0b0a07060302};
};

template <typename Dequantizer, typename Q8>
inline void sum_4(int i, Dequantizer& deq, const Q8& q8, const float16x4_t * sc16, float32x4_t * acc) {
    for (int iy = 0; iy < Q8::nrc_y; ++iy) {
        auto pall = sum_4_blocks(deq.bits.b, q8.quant_data(iy, i));
        auto scale = vcvt_f32_f16(sc16[iy]);
        acc[iy] = vmlaq_f32(acc[iy], scale, vcvtq_f32_s32(pall));
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y(int n, Dequantizer& deq, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;
    float16x4_t sc16[Q8::nrc_y];
    for (int ix = 0; ix < nrc_x; ++ix) {
        deq.new_row(ix);
        float32x4_t acc[Q8::nrc_y];
        for (int iy = 0; iy < Q8::nrc_y; ++iy) acc[iy] = vdupq_n_f32(0.f);
        for (int i = 0; i < nb/4; ++i) {
            q8.process_scales(i, deq, sc16, acc);
            sum_4(i, deq, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq, acc);
        }
        for (int iy = 0; iy < Q8::nrc_y; ++iy) {
            info.store(ix, iy, vaddvq_f32(acc[iy]));
        }
    }
}

template <typename Dequantizer, typename Q8>
inline void mul_mat_qX_Y_q8_Y_1(int n, Dequantizer& deq1, Dequantizer& deq2, Q8& q8, const DataInfo& info, int nrc_x) {
    const int nb = n / QK4_1;
    float16x4_t sc16[2];
    for (int ix = 0; ix < nrc_x; ++ix) {
        deq1.new_row(ix);
        deq2.new_row(ix);
        float32x4_t acc[2] = { vdupq_n_f32(0.f), vdupq_n_f32(0.f) };
        for (int i = 0; i < nb/8; ++i) {
            q8.process_scales(2*i+0, deq1, sc16+0, acc+0);
            q8.process_scales(2*i+1, deq2, sc16+1, acc+1);
            sum_4(2*i+0, deq1, q8, sc16+0, acc+0);
            sum_4(2*i+1, deq2, q8, sc16+1, acc+1);
        }
        for (int i = 2*(nb/8); i < nb/4; ++i) {
            q8.process_scales(i, deq1, sc16, acc);
            sum_4(i, deq1, q8, sc16, acc);
        }
        for (int i = 4*(nb/4); i < nb; ++i) {
            q8.process_1_block(i, deq1, acc);
        }
        info.store(ix, 0, vaddvq_f32(vaddq_f32(acc[0], acc[1])));
    }
}
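
// Block coverage in mul_mat_qX_Y_q8_Y_1, for reference: the main loop consumes
// blocks in pairs of 4-block chunks (deq1 handles the even chunk, deq2 the odd
// one), then leftover 4-block chunks go through deq1 alone, and any final
// blocks are processed one at a time:
//   blocks [0, 8*(nb/8))         -> paired chunks, deq1 + deq2
//   blocks [8*(nb/8), 4*(nb/4))  -> single chunks, deq1
//   blocks [4*(nb/4), nb)        -> process_1_block, deq1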

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_1_q8_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q81<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer, int nrc_y>
static void IQK_NOINLINE mul_mat_qX_0_q8_0(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Q80<nrc_y> q8(info);
    if constexpr (nrc_y == 1) {
        Dequantizer deq1(vx, bx), deq2(vx, bx);
        mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
    } else {
        Dequantizer deq(vx, bx);
        mul_mat_qX_Y_q8_Y(n, deq, q8, info, nrc_x);
    }
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_1_q8_1_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q81<1> q8(info);
    mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer>
static void IQK_NOINLINE mul_mat_qX_0_q8_0_1(int n, const void * vx, size_t bx, const DataInfo& info, int nrc_x) {
    Dequantizer deq1(vx, bx), deq2(vx, bx);
    Q80<1> q8(info);
    mul_mat_qX_Y_q8_Y_1(n, deq1, deq2, q8, info, nrc_x);
}

template <typename Dequantizer> void MulMat::set_functions(MulMat& m) {
    if constexpr (std::is_same_v<Dequantizer, DequantizerQ40> || std::is_same_v<Dequantizer, DequantizerQ50> ||
                  std::is_same_v<Dequantizer, DequantizerQ80>) {
        m.funcs[0] = mul_mat_qX_0_q8_0<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_0_q8_0<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_0_q8_0<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_0_q8_0<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_0_q8_0<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_0_q8_0<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_0_q8_0<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_0_q8_0<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerQ41> || std::is_same_v<Dequantizer, DequantizerQ51>) {
        m.funcs[0] = mul_mat_qX_1_q8_1<Dequantizer, 1>;
        m.funcs[1] = mul_mat_qX_1_q8_1<Dequantizer, 2>;
        m.funcs[2] = mul_mat_qX_1_q8_1<Dequantizer, 3>;
        m.funcs[3] = mul_mat_qX_1_q8_1<Dequantizer, 4>;
        m.funcs[4] = mul_mat_qX_1_q8_1<Dequantizer, 5>;
        m.funcs[5] = mul_mat_qX_1_q8_1<Dequantizer, 6>;
        m.funcs[6] = mul_mat_qX_1_q8_1<Dequantizer, 7>;
        m.funcs[7] = mul_mat_qX_1_q8_1<Dequantizer, 8>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2XXS> || std::is_same_v<Dequantizer, DequantizerIQ3XXS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQXXS<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQXXS<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQXXS<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQXXS<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQXXS<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQXXS<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQXXS<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQXXS<8, Dequantizer>;
    }
    else if constexpr (std::is_same_v<Dequantizer, DequantizerIQ2S> || std::is_same_v<Dequantizer, DequantizerIQ3S> ||
                       std::is_same_v<Dequantizer, DequantizerIQ2XS>) {
        m.funcs[0] = mul_mat_qX_K_q8_K_IQ<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_IQ<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_IQ<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_IQ<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_IQ<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_IQ<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_IQ<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_IQ<8, Dequantizer>;
    }
    else {
        m.funcs[0] = mul_mat_qX_K_q8_K_T<1, Dequantizer>;
        m.funcs[1] = mul_mat_qX_K_q8_K_T<2, Dequantizer>;
        m.funcs[2] = mul_mat_qX_K_q8_K_T<3, Dequantizer>;
        m.funcs[3] = mul_mat_qX_K_q8_K_T<4, Dequantizer>;
        m.funcs[4] = mul_mat_qX_K_q8_K_T<5, Dequantizer>;
        m.funcs[5] = mul_mat_qX_K_q8_K_T<6, Dequantizer>;
        m.funcs[6] = mul_mat_qX_K_q8_K_T<7, Dequantizer>;
        m.funcs[7] = mul_mat_qX_K_q8_K_T<8, Dequantizer>;
    }
}

bool MulMat::set_mul_mat(int typeA, int ne00, MulMat& m, int& row_size_q8, int Ny) {
    row_size_q8 = ggml_row_size(GGML_TYPE_Q8_K, ne00);
    (void)Ny;
    // Uncommenting this would disable iqk_mul_mat for matrix x vector multiplications.
    //if (Ny == 1 && (typeA == GGML_TYPE_IQ2_XXS || typeA == GGML_TYPE_IQ2_XS || typeA == GGML_TYPE_IQ2_S ||
    //                typeA == GGML_TYPE_IQ3_XXS || typeA == GGML_TYPE_IQ3_S)) return false;
    switch (typeA) {
        case GGML_TYPE_Q2_K:
            MulMat::set_functions<DequantizerQ2K>(m);
            break;
        case GGML_TYPE_Q3_K:
            MulMat::set_functions<DequantizerQ3K>(m);
            break;
        case GGML_TYPE_Q4_K:
            MulMat::set_functions<DequantizerQ4K>(m);
            break;
        case GGML_TYPE_Q5_K:
            MulMat::set_functions<DequantizerQ5K>(m);
            break;
        case GGML_TYPE_Q6_K:
            MulMat::set_functions<DequantizerQ6K>(m);
            break;
        case GGML_TYPE_IQ4_XS:
            MulMat::set_functions<DequantizerIQ4XS>(m);
            break;
        case GGML_TYPE_IQ3_S:
            MulMat::set_functions<DequantizerIQ3S>(m);
            break;
        case GGML_TYPE_IQ3_XXS:
            MulMat::set_functions<DequantizerIQ3XXS>(m);
            break;
        case GGML_TYPE_IQ2_S:
            MulMat::set_functions<DequantizerIQ2S>(m);
            break;
        case GGML_TYPE_IQ2_XS:
            MulMat::set_functions<DequantizerIQ2XS>(m);
            break;
        case GGML_TYPE_IQ2_XXS:
            MulMat::set_functions<DequantizerIQ2XXS>(m);
            break;
        case GGML_TYPE_Q4_0:
            MulMat::set_functions<DequantizerQ40>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q4_1:
            MulMat::set_functions<DequantizerQ41>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q5_0:
            MulMat::set_functions<DequantizerQ50>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        case GGML_TYPE_Q5_1:
            MulMat::set_functions<DequantizerQ51>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_1, ne00);
            break;
        case GGML_TYPE_Q8_0:
            MulMat::set_functions<DequantizerQ80>(m);
            row_size_q8 = ggml_row_size(GGML_TYPE_Q8_0, ne00);
            break;
        default:
            return false;
    }
    return true;
}

}

#endif // __x86_64__ or __aarch64__
third_party/llamafile/iqk_mul_mat_amd_avx2.cpp
0 → 100644
View file @
18c42e67
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_avx2.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#include "iqk_mul_mat.inc"
#endif // __x86_64__
third_party/llamafile/iqk_mul_mat_amd_zen4.cpp
0 → 100644
View file @
18c42e67
// Adapted from
// https://github.com/Mozilla-Ocho/llamafile/blob/0.8.8/llamafile/iqk_mul_mat_amd_zen4.cpp
// Copyright 2024 Iwan Kawrakow.
// Copyright(c) 2024 by KVCache.AI, All Rights Reserved.
#ifdef __x86_64__
#define iqk_mul_mat iqk_mul_mat_zen4
#define iqk_mul_mat_moe iqk_mul_mat_moe_zen4
#include "iqk_mul_mat.inc"
#endif // __x86_64__