Commit 85a07c6

Merge branch 'concedo_experimental' into esocrok

2 parents 759ea68 + 126104f
15 files changed: +604 -137 lines changed

common/arg.cpp

Lines changed: 130 additions & 1 deletion

@@ -747,6 +747,124 @@ std::pair<long, std::vector<char>> common_remote_get_content(const std::string &
 
 #endif // LLAMA_USE_CURL
 
+//
+// Docker registry functions
+//
+
+static std::string common_docker_get_token(const std::string & repo) {
+    std::string url = "https://auth.docker.io/token?service=registry.docker.io&scope=repository:" + repo + ":pull";
+
+    common_remote_params params;
+    auto res = common_remote_get_content(url, params);
+
+    if (res.first != 200) {
+        throw std::runtime_error("Failed to get Docker registry token, HTTP code: " + std::to_string(res.first));
+    }
+
+    std::string response_str(res.second.begin(), res.second.end());
+    nlohmann::ordered_json response = nlohmann::ordered_json::parse(response_str);
+
+    if (!response.contains("token")) {
+        throw std::runtime_error("Docker registry token response missing 'token' field");
+    }
+
+    return response["token"].get<std::string>();
+}
+
+static std::string common_docker_resolve_model(const std::string & docker) {
+    // Parse ai/smollm2:135M-Q4_K_M
+    size_t colon_pos = docker.find(':');
+    std::string repo, tag;
+    if (colon_pos != std::string::npos) {
+        repo = docker.substr(0, colon_pos);
+        tag  = docker.substr(colon_pos + 1);
+    } else {
+        repo = docker;
+        tag  = "latest";
+    }
+
+    // ai/ is the default
+    size_t slash_pos = docker.find('/');
+    if (slash_pos == std::string::npos) {
+        repo.insert(0, "ai/");
+    }
+
+    LOG_INF("%s: Downloading Docker Model: %s:%s\n", __func__, repo.c_str(), tag.c_str());
+    try {
+        // --- helper: digest validation ---
+        auto validate_oci_digest = [](const std::string & digest) -> std::string {
+            // Expected: algo:hex ; start with sha256 (64 hex chars)
+            // You can extend this map if supporting other algorithms in future.
+            static const std::regex re("^sha256:([a-fA-F0-9]{64})$");
+            std::smatch m;
+            if (!std::regex_match(digest, m, re)) {
+                throw std::runtime_error("Invalid OCI digest format received in manifest: " + digest);
+            }
+            // normalize hex to lowercase
+            std::string normalized = digest;
+            std::transform(normalized.begin() + 7, normalized.end(), normalized.begin() + 7,
+                           [](unsigned char c) { return std::tolower(c); });
+            return normalized;
+        };
+
+        std::string token = common_docker_get_token(repo); // Get authentication token
+
+        // Get manifest
+        const std::string url_prefix = "https://registry-1.docker.io/v2/" + repo;
+        std::string manifest_url = url_prefix + "/manifests/" + tag;
+        common_remote_params manifest_params;
+        manifest_params.headers.push_back("Authorization: Bearer " + token);
+        manifest_params.headers.push_back(
+            "Accept: application/vnd.docker.distribution.manifest.v2+json,application/vnd.oci.image.manifest.v1+json");
+        auto manifest_res = common_remote_get_content(manifest_url, manifest_params);
+        if (manifest_res.first != 200) {
+            throw std::runtime_error("Failed to get Docker manifest, HTTP code: " + std::to_string(manifest_res.first));
+        }
+
+        std::string manifest_str(manifest_res.second.begin(), manifest_res.second.end());
+        nlohmann::ordered_json manifest = nlohmann::ordered_json::parse(manifest_str);
+        std::string gguf_digest; // Find the GGUF layer
+        if (manifest.contains("layers")) {
+            for (const auto & layer : manifest["layers"]) {
+                if (layer.contains("mediaType")) {
+                    std::string media_type = layer["mediaType"].get<std::string>();
+                    if (media_type == "application/vnd.docker.ai.gguf.v3" ||
+                        media_type.find("gguf") != std::string::npos) {
+                        gguf_digest = layer["digest"].get<std::string>();
+                        break;
+                    }
+                }
+            }
+        }
+
+        if (gguf_digest.empty()) {
+            throw std::runtime_error("No GGUF layer found in Docker manifest");
+        }
+
+        // Validate & normalize digest
+        gguf_digest = validate_oci_digest(gguf_digest);
+        LOG_DBG("%s: Using validated digest: %s\n", __func__, gguf_digest.c_str());
+
+        // Prepare local filename
+        std::string model_filename = repo;
+        std::replace(model_filename.begin(), model_filename.end(), '/', '_');
+        model_filename += "_" + tag + ".gguf";
+        std::string local_path = fs_get_cache_file(model_filename);
+
+        const std::string blob_url = url_prefix + "/blobs/" + gguf_digest;
+        if (!common_download_file_single(blob_url, local_path, token, false)) {
+            throw std::runtime_error("Failed to download Docker Model");
+        }
+
+        LOG_INF("%s: Downloaded Docker Model to: %s\n", __func__, local_path.c_str());
+        return local_path;
+    } catch (const std::exception & e) {
+        LOG_ERR("%s: Docker Model download failed: %s\n", __func__, e.what());
+        throw;
+    }
+}
+
 //
 // utils
 //
@@ -797,7 +915,9 @@ static handle_model_result common_params_handle_model(
     handle_model_result result;
     // handle pre-fill default model path and url based on hf_repo and hf_file
     {
-        if (!model.hf_repo.empty()) {
+        if (!model.docker_repo.empty()) { // Handle Docker URLs by resolving them to local paths
+            model.path = common_docker_resolve_model(model.docker_repo);
+        } else if (!model.hf_repo.empty()) {
             // short-hand to avoid specifying --hf-file -> default it to --model
             if (model.hf_file.empty()) {
                 if (model.path.empty()) {
@@ -2638,6 +2758,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.model.url = value;
         }
     ).set_env("LLAMA_ARG_MODEL_URL"));
+    add_opt(common_arg(
+        { "-dr", "--docker-repo" }, "[<repo>/]<model>[:quant]",
+        "Docker Hub model repository. repo is optional, default to ai/. quant is optional, default to :latest.\n"
+        "example: gemma3\n"
+        "(default: unused)",
+        [](common_params & params, const std::string & value) {
+            params.model.docker_repo = value;
+        }
+    ).set_env("LLAMA_ARG_DOCKER_REPO"));
     add_opt(common_arg(
         {"-hf", "-hfr", "--hf-repo"}, "<user>/<model>[:quant]",
         "Hugging Face model repository; quant is optional, case-insensitive, default to Q4_K_M, or falls back to the first file in the repo if Q4_K_M doesn't exist.\n"

common/common.h

Lines changed: 6 additions & 5 deletions

@@ -189,10 +189,11 @@ struct common_params_sampling {
 };
 
 struct common_params_model {
-    std::string path    = ""; // model local path      // NOLINT
-    std::string url     = ""; // model url to download // NOLINT
-    std::string hf_repo = ""; // HF repo               // NOLINT
-    std::string hf_file = ""; // HF file               // NOLINT
+    std::string path        = ""; // model local path      // NOLINT
+    std::string url         = ""; // model url to download // NOLINT
+    std::string hf_repo     = ""; // HF repo               // NOLINT
+    std::string hf_file     = ""; // HF file               // NOLINT
+    std::string docker_repo = ""; // Docker repo           // NOLINT
 };
 
 struct common_params_speculative {
@@ -448,7 +449,7 @@ struct common_params {
 
     std::string slot_save_path;
 
-    float slot_prompt_similarity = 0.5f;
+    float slot_prompt_similarity = 0.1f;
 
     // batched-bench params
     bool is_pp_shared = false;
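
A consumer-side sketch of the extended struct (hypothetical helper and values; the precedence of docker_repo over hf_repo is enforced by common_params_handle_model, shown in common/arg.cpp above):

    #include "common.h"

    // Hypothetical helper: when docker_repo is set, common_params_handle_model
    // resolves it to a local path before hf_repo is ever consulted.
    static common_params_model make_docker_model(const std::string & ref) {
        common_params_model model;
        model.docker_repo = ref;               // e.g. "ai/smollm2:135M-Q4_K_M"
        model.hf_repo     = "ggml-org/models"; // hypothetical; ignored while docker_repo is set
        return model;
    }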

ggml/include/ggml-backend.h

Lines changed: 12 additions & 0 deletions

@@ -132,6 +132,8 @@ extern "C" {
         GGML_BACKEND_DEVICE_TYPE_CPU,
         // GPU device using dedicated memory
         GGML_BACKEND_DEVICE_TYPE_GPU,
+        // integrated GPU device using host memory
+        GGML_BACKEND_DEVICE_TYPE_IGPU,
         // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
         GGML_BACKEND_DEVICE_TYPE_ACCEL
     };
@@ -150,11 +152,21 @@ extern "C" {
 
     // all the device properties
     struct ggml_backend_dev_props {
+        // device name
         const char * name;
+        // device description
         const char * description;
+        // device free memory in bytes
         size_t memory_free;
+        // device total memory in bytes
         size_t memory_total;
+        // device type
         enum ggml_backend_dev_type type;
+        // device id
+        // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        // if the id is unknown, this should be NULL
+        const char * device_id;
+        // device capabilities
         struct ggml_backend_dev_caps caps;
     };
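A minimal enumeration sketch (not part of this commit) that prints the newly documented fields, using only the public registry API already declared in ggml-backend.h:

    #include <stdio.h>

    #include "ggml-backend.h"

    int main(void) {
        // Walk all registered devices and dump their properties,
        // including the new device_id field (NULL when unknown).
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            struct ggml_backend_dev_props props;
            ggml_backend_dev_get_props(dev, &props);
            printf("%s (%s): type=%d id=%s free=%zu total=%zu\n",
                   props.name, props.description, (int) props.type,
                   props.device_id ? props.device_id : "unknown",
                   props.memory_free, props.memory_total);
        }
        return 0;
    }
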
ggml/src/ggml-backend-impl.h

Lines changed: 1 addition & 1 deletion

@@ -8,7 +8,7 @@
 extern "C" {
 #endif
 
-#define GGML_BACKEND_API_VERSION 1
+#define GGML_BACKEND_API_VERSION 2
 
 //
 // Backend buffer type
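
The bump matters because dynamically loaded backends report the API version they were built against. A sketch of the compatibility gate this enables (hypothetical function name, assuming only the api_version field that ggml_backend_reg structs carry per ggml-backend-impl.h):

    #include "ggml-backend-impl.h"

    // Hypothetical guard: a backend built against API version 1 is
    // rejected by a ggml built with GGML_BACKEND_API_VERSION == 2.
    static bool backend_abi_compatible(ggml_backend_reg_t reg) {
        return reg->api_version == GGML_BACKEND_API_VERSION;
    }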

ggml/src/ggml-backend-reg.cpp

Lines changed: 2 additions & 3 deletions

@@ -400,9 +400,8 @@ ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const
 
 ggml_backend_t ggml_backend_init_best(void) {
     ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
-    if (!dev) {
-        dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
-    }
+    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU);
+    dev = dev ? dev : ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
     if (!dev) {
         return nullptr;
     }
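
The fallback order is now discrete GPU, then integrated GPU, then CPU. A caller-side sketch of the same chain using only the public API (hypothetical wrapper, assuming ggml_backend_init_by_type as declared in the hunk header above):

    #include "ggml-backend.h"

    // Hypothetical equivalent of the new ggml_backend_init_best order:
    // discrete GPU -> integrated GPU -> CPU.
    static ggml_backend_t init_best_explicit(void) {
        ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, NULL);
        if (!backend) {
            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_IGPU, NULL);
        }
        if (!backend) {
            backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
        }
        return backend; // NULL if no backend is available
    }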

ggml/src/ggml-cuda/common.cuh

Lines changed: 16 additions & 2 deletions

@@ -560,7 +560,7 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const float2 v
 }
 
 static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v, const half2 u) {
-#if defined(GGML_USE_HIP) && defined(GCN)
+#if defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(__gfx906__) || defined(CDNA))
     asm volatile("v_dot2_f32_f16 %0, %1, %2, %0" : "+v"(acc) : "v"(v), "v"(u));
 #else
 #ifdef FAST_FP16_AVAILABLE
@@ -572,7 +572,21 @@ static __device__ __forceinline__ void ggml_cuda_mad(float & acc, const half2 v,
     acc += tmpv.x * tmpu.x;
     acc += tmpv.y * tmpu.y;
 #endif // FAST_FP16_AVAILABLE
-#endif // defined(GGML_USE_HIP) && defined(GCN)
+#endif // defined(GGML_USE_HIP) && (defined(RDNA2) || defined(RDNA3) || defined(RDNA4) || defined(GCN5) || defined(CDNA))
+}
+
+// Aligned memory transfers of 8/16 bytes can be faster than 2 transfers with 4 bytes, especially on AMD.
+template <int nbytes>
+static __device__ __forceinline__ void ggml_cuda_memcpy_1(void * __restrict__ dst, const void * __restrict__ src) {
+    if constexpr (nbytes == 4) {
+        *(int *) dst = *(const int *) src;
+    } else if constexpr (nbytes == 8) {
+        *(int2 *) dst = *(const int2 *) src;
+    } else if constexpr (nbytes == 16) {
+        *(int4 *) dst = *(const int4 *) src;
+    } else {
+        static_assert(nbytes == 0 && nbytes == -1, "bad nbytes");
+    }
 }
 
 static __device__ __forceinline__ float ggml_cuda_e8m0_to_fp32(uint8_t x) {
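
A hypothetical kernel (not from this commit) showing the intended use of the new helper: with nbytes == 16 and 16-byte-aligned pointers, each thread issues one vectorized int4 load/store instead of four scalar ones:

    // Hypothetical device-side usage: each thread copies one nbytes-sized,
    // nbytes-aligned tile with a single vectorized transfer.
    template <int nbytes>
    __global__ void copy_tiles(char * __restrict__ dst, const char * __restrict__ src, const int ntiles) {
        const int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < ntiles) {
            ggml_cuda_memcpy_1<nbytes>(dst + i * nbytes, src + i * nbytes);
        }
    }

    // launch sketch: copy_tiles<16><<<(ntiles + 255) / 256, 256>>>(dst, src, ntiles);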
