-fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
+fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
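// A minimal sketch (not the project's code) of how a consumer could turn the
// response above into a percentage. api.ProgressResponse is the real type;
// renderProgress is a hypothetical helper for illustration. The switch to a
// 19-character placeholder digest presumably keeps clients that key and
// format progress bars by a fixed-width digest from misbehaving (an
// assumption based on the change above, not confirmed here).
func renderProgress(resp api.ProgressResponse) string {
	if resp.Total == 0 {
		return resp.Status
	}
	pct := float64(resp.Completed) / float64(resp.Total) * 100
	return fmt.Sprintf("%s %3.0f%%", resp.Status, pct)
}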
// Check if the first dimension is divisible by the K-quant block size
if nx%qk_k != 0 {
    // Store the original type for logging before reassigning it
    originalType := newType
    // Select an appropriate fallback based on the original type
    switch newType {
    case fsggml.TensorTypeQ4_K:
        newType = fsggml.TensorTypeQ5_0
    case fsggml.TensorTypeQ5_K:
        newType = fsggml.TensorTypeQ5_1
    case fsggml.TensorTypeQ6_K:
        newType = fsggml.TensorTypeQ8_0
    default:
        newType = fsggml.TensorTypeF16
    }
    // Final check: if still incompatible, fall back to F16
    if nx%newType.BlockSize() != 0 {
        newType = fsggml.TensorTypeF16
    }
    slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
        nx, qk_k, originalType.String(), newType.String()))
}
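// As a standalone sketch of the selection above (assuming the fsggml
// constants used in this snippet): the K-quant formats pack 256-element
// super-blocks, so rows that are not a multiple of 256 need a format with a
// smaller block. fallbackType is a hypothetical helper, not a function in
// the codebase.
func fallbackType(t fsggml.TensorType) fsggml.TensorType {
	switch t {
	case fsggml.TensorTypeQ4_K:
		return fsggml.TensorTypeQ5_0 // 32-element blocks, similar bits per weight
	case fsggml.TensorTypeQ5_K:
		return fsggml.TensorTypeQ5_1 // 32-element blocks
	case fsggml.TensorTypeQ6_K:
		return fsggml.TensorTypeQ8_0 // 32-element blocks
	default:
		return fsggml.TensorTypeF16 // give up on quantizing this tensor
	}
}
// Worked example: a row length of 4096 keeps its K-quant (4096 % 256 == 0),
// while a row length of 1000 fails it (1000 % 256 == 232) and also fails the
// 32-wide fallback blocks (1000 % 32 == 8), so that tensor lands on F16.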
	return nil, nil, nil, fmt.Errorf("'llama3.2-vision' is no longer compatible with your version of Ollama and has been replaced by a newer version. To re-download, run 'ollama pull llama3.2-vision'")
}

if err := model.CheckCapabilities(caps...); err != nil {
	return nil, nil, nil, fmt.Errorf("%s %w", name, err)
}
...
...
@@ -181,6 +186,13 @@ func (s *Server) GenerateHandler(c *gin.Context) {
 	if req.Suffix != "" {
 		caps = append(caps, model.CapabilityInsert)
 	}
+	if req.Think != nil && *req.Think {
+		caps = append(caps, model.CapabilityThinking)
+		// TODO(drifkin): consider adding a warning if it's false and the model
+		// doesn't support thinking. It's not strictly required, but it can be a
+		// hint that the user is on an older qwen3/r1 model that doesn't have an