diff --git a/examples/c/CMakeLists.txt b/examples/c/CMakeLists.txt index 3ff3c5247..76047cecb 100644 --- a/examples/c/CMakeLists.txt +++ b/examples/c/CMakeLists.txt @@ -6,6 +6,7 @@ set(CMAKE_CXX_STANDARD 20) option(USE_CUDA "Build with CUDA support" OFF) option(USE_CXX "Invoke the C++ example" ON) option(PHI3 "Build the Phi example" OFF) +option(LLAMA "Build the Llama example" OFF) option(PHI3V "Build the Phi3v example" OFF) option(WHISPER "Build the Whisper example" OFF) @@ -58,6 +59,11 @@ if(PHI3) prepare_executable(phi3) endif() +if(LLAMA) + add_executable(llama ${CMAKE_SOURCE_DIR}/src/llama.cpp) + prepare_executable(llama) +endif() + if(PHI3V) add_executable(phi3v ${CMAKE_SOURCE_DIR}/src/phi3v.cpp) prepare_executable(phi3v) diff --git a/examples/c/README.md b/examples/c/README.md index 0fd01018d..e80b1eb46 100644 --- a/examples/c/README.md +++ b/examples/c/README.md @@ -221,15 +221,125 @@ Change into the onnxruntime-genai directory. 2. Build onnxruntime-genai from source and install - This example requires onnxruntime-genai to be built from source. + ```bash + curl -L https://github.com/microsoft/onnxruntime-genai/releases/download/v0.4.0/onnxruntime-genai-linux-cpu-x64-capi.zip -o onnxruntime-genai-linux-cpu-x64-capi.zip + unzip onnxruntime-genai-linux-cpu-x64-capi.zip + cd onnxruntime-genai-linux-cpu-x64-capi + tar xvf onnxruntime-genai-0.4.0-linux-x64.tar.gz + cp onnxruntime-genai-0.4.0-linux-x64/include/* ../include + cp onnxruntime-genai-0.4.0-linux-x64/lib/* ../lib + cd .. + ``` + +#### Build this sample + +Build with CUDA: + +```bash +cmake . -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=80 -DUSE_CUDA=ON -DPHI3=ON +cd build +cmake --build . --config Release +``` + +Build for CPU: + +```bash +cmake . -B build -DPHI3=ON +cd build +cmake --build . 
--config Release +``` + +#### Run the sample + +```bash +./phi3 path_to_model +``` + +## Llama + +### Obtain model + +To access Llama models, you need to sign the license agreement on HuggingFace. Navigate to the model on HuggingFace, e.g. https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct, and sign the license agreement. + +Once you have been granted access, run the following steps to generate the ONNX model in the precision and for the target that you want to run on. Note: this operation requires 64GB of RAM to complete. + +```bash +pip install torch transformers onnx onnxruntime onnxruntime-genai huggingface-hub[cli] +huggingface-cli login +python -m onnxruntime_genai.models.builder -m meta-llama/Llama-3.1-8B-Instruct -e cpu -p int4 -o llama-3.1-8b-instruct-onnx +``` + +The models and all of the necessary metadata will be available in a folder called `llama-3.1-8b-instruct-onnx`. + +### Windows x64 CPU + +#### Install the onnxruntime and onnxruntime-genai binaries + +Change into the `onnxruntime-genai\examples\c` folder. + +1. Install onnxruntime + + ```cmd + curl -L https://github.com/microsoft/onnxruntime/releases/download/v1.19.2/onnxruntime-win-x64-1.19.2.zip -o onnxruntime-win-x64-1.19.2.zip + tar xvf onnxruntime-win-x64-1.19.2.zip + copy onnxruntime-win-x64-1.19.2\include\* include + copy onnxruntime-win-x64-1.19.2\lib\* lib + ``` + +2. Install onnxruntime-genai + + ```cmd + curl -L https://github.com/microsoft/onnxruntime-genai/releases/download/v0.4.0/onnxruntime-genai-win-cpu-x64-capi.zip -o onnxruntime-genai-win-cpu-x64-capi.zip + tar xvf onnxruntime-genai-win-cpu-x64-capi.zip + cd onnxruntime-genai-win-cpu-x64-capi + tar xvf onnxruntime-genai-0.4.0-win-x64.zip + copy onnxruntime-genai-0.4.0-win-x64\include\* ..\include + copy onnxruntime-genai-0.4.0-win-x64\lib\* ..\lib + cd .. + ``` + +#### Build this sample + +```bash +cmake -A x64 -S . -B build -DLLAMA=ON +cd build +cmake --build . 
--config Release +``` + +#### Run the sample + +```bash +cd Release +.\llama.exe llama-3.1-8b-instruct-onnx +``` + +### Linux + +#### Install the onnxruntime and onnxruntime-genai binaries + +Change into the onnxruntime-genai directory. + +1. Install onnxruntime ```bash - # This should be run from the root of the onnxruntime-genai folder - python build.py --config Release --ort_home examples\c - cp src/ort_genai.h examples/c/include - cp src/ort_genai_c.h examples/c/include - cp build/Linux/release/onnxruntime-genai.so examples/c/lib cd examples/c + curl -L https://github.com/microsoft/onnxruntime/releases/download/v1.19.2/onnxruntime-linux-x64-1.19.2.tgz -o onnxruntime-linux-x64-1.19.2.tgz + tar xvzf onnxruntime-linux-x64-1.19.2.tgz + cp onnxruntime-linux-x64-1.19.2/include/* include + cp onnxruntime-linux-x64-1.19.2/lib/* lib + cd ../.. + ``` + +2. Install onnxruntime-genai + + ```bash + curl -L https://github.com/microsoft/onnxruntime-genai/releases/download/v0.4.0/onnxruntime-genai-linux-cpu-x64-capi.zip -o onnxruntime-genai-linux-cpu-x64-capi.zip + unzip onnxruntime-genai-linux-cpu-x64-capi.zip + cd onnxruntime-genai-linux-cpu-x64-capi + tar xvf onnxruntime-genai-0.4.0-linux-x64.tar.gz + cp onnxruntime-genai-0.4.0-linux-x64/include/* ../include + cp onnxruntime-genai-0.4.0-linux-x64/lib/* ../lib + cd .. ``` #### Build this sample @@ -237,16 +347,15 @@ Change into the onnxruntime-genai directory. Build with CUDA: ```bash -mkdir build +cmake . -B build -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=80 -DUSE_CUDA=ON -DLLAMA=ON cd build -cmake ../ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CUDA_ARCHITECTURES=80 -DUSE_CUDA=ON -DPHI3=ON cmake --build . --config Release ``` Build for CPU: ```bash -cmake . -DPHI3=ON +cmake . -B build -DLLAMA=ON cd build cmake --build . --config Release ``` @@ -254,10 +363,10 @@ cmake --build . 
--config Release
 #### Run the sample
 
 ```bash
-cd Release
-./phi3 path_to_model
+./llama path_to_model
 ```
 
+
 ## Phi-3 vision
 
 ### Download model
diff --git a/examples/c/include/.gitkeep b/examples/c/include/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/c/lib/.gitkeep b/examples/c/lib/.gitkeep
deleted file mode 100644
index e69de29bb..000000000
diff --git a/examples/c/src/llama.cpp b/examples/c/src/llama.cpp
new file mode 100644
index 000000000..d8a8bd7c7
--- /dev/null
+++ b/examples/c/src/llama.cpp
@@ -0,0 +1,101 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include "ort_genai.h"
+
+// C++ API Example
+
+// Runs an interactive chat loop against the model at `model_path`.
+// Each line read from stdin is wrapped in a chat-style prompt template and
+// the response is streamed to stdout one token at a time. Returns once stdin
+// reaches end-of-file or a read fails.
+void CXX_API(const char* model_path) {
+  std::cout << "Creating model..." << std::endl;
+  auto model = OgaModel::Create(model_path);
+  std::cout << "Creating tokenizer..." << std::endl;
+  auto tokenizer = OgaTokenizer::Create(*model);
+  auto tokenizer_stream = OgaTokenizerStream::Create(*tokenizer);
+
+  while (true) {
+    std::string text;
+    std::cout << "Prompt: " << std::endl;
+    // Leave the loop on EOF or a read error; otherwise a closed stdin would
+    // make this loop spin forever on an empty prompt.
+    if (!std::getline(std::cin, text)) {
+      break;
+    }
+
+    const std::string prompt = "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\nYou are a helpful AI assistant. Give a short answer to the following<|eot_id|><|start_header_id|>user<|end_header_id|>" + text + "<|eot_id|><|start_header_id|>assistant<|end_header_id|>";
+
+    auto sequences = OgaSequences::Create();
+    tokenizer->Encode(prompt.c_str(), *sequences);
+
+    std::cout << "Generating response..." << std::endl;
+    auto params = OgaGeneratorParams::Create(*model);
+    params->SetSearchOption("max_length", 1024);
+    params->SetSearchOptionBool("do_sample", true);
+    params->SetInputSequences(*sequences);
+
+    auto generator = OgaGenerator::Create(*model, *params);
+
+    // Produce one token per iteration and print it as soon as it is decoded.
+    while (!generator->IsDone()) {
+      generator->ComputeLogits();
+      generator->GenerateNextToken();
+
+      // Show usage of GetOutput
+      std::unique_ptr<OgaTensor> output_logits = generator->GetOutput("logits");
+
+      // Assuming output_logits.Type() is float as it's logits
+      // Assuming shape is 1 dimensional with shape[0] being the size
+      [[maybe_unused]] auto logits = reinterpret_cast<float*>(output_logits->Data());
+
+      // Print out the logits using the following snippet, if needed
+      //auto shape = output_logits->Shape();
+      //for (size_t i=0; i < shape[0]; i++)
+      //  std::cout << logits[i] << " ";
+      //std::cout << std::endl;
+
+      // The most recent token is the last element of sequence 0.
+      const auto num_tokens = generator->GetSequenceCount(0);
+      const auto new_token = generator->GetSequenceData(0)[num_tokens - 1];
+      std::cout << tokenizer_stream->Decode(new_token) << std::flush;
+    }
+
+    // Blank lines to separate this response from the next prompt.
+    for (int i = 0; i < 3; ++i)
+      std::cout << std::endl;
+  }
+}
+
+
+// Prints the expected command line to stderr.
+static void print_usage(int /*argc*/, char** argv) {
+  std::cerr << "usage: " << argv[0] << " model_path" << std::endl;
+}
+
+// Entry point. Expects exactly one argument (the model path); returns -1
+// after printing usage otherwise.
+int main(int argc, char** argv) {
+  if (argc != 2) {
+    print_usage(argc, argv);
+    return -1;
+  }
+
+  // Responsible for cleaning up the library during shutdown
+  OgaHandle handle;
+
+  std::cout << "-------------" << std::endl;
+  std::cout << "Run Llama" << std::endl;
+  std::cout << "-------------" << std::endl;
+
+#ifdef USE_CXX
+  std::cout << "C++ API" << std::endl;
+  CXX_API(argv[1]);
+#endif
+
+  return 0;
+}