diff --git a/README.md b/README.md
index 5c91169b0ca65..e8bfc25c4e72a 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,3 @@
-# create-svelte
-
-Everything you need to build a Svelte project, powered by [`create-svelte`](https://github.com/sveltejs/kit/tree/master/packages/create-svelte).
-
## Creating a project
If you're seeing this, you've probably already done this step. Congrats!
diff --git a/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png
new file mode 100644
index 0000000000000..4646c06045023
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-13B-E2E-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png
new file mode 100644
index 0000000000000..eecce4751cbd0
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png
new file mode 100644
index 0000000000000..b8cdf011dfec3
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-13B-Prompt-Latency.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency 1.png b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency 1.png
new file mode 100644
index 0000000000000..814a8e6d81a02
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure2-LLaMA-2-7B-Prompt-Latency 1.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png
new file mode 100644
index 0000000000000..b5ea1e1c5f918
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-13B-Tokens-Generated-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png
new file mode 100644
index 0000000000000..0bc90a464bc71
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure3-LLaMA-2-7B-Tokens-Generated-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/Figure4-LLaMA-2-70B-Model-Throughput.png b/src/images/blogs/accelerating-llama-2/Figure4-LLaMA-2-70B-Model-Throughput.png
new file mode 100644
index 0000000000000..acd993602a032
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/Figure4-LLaMA-2-70B-Model-Throughput.png differ
diff --git a/src/images/blogs/accelerating-llama-2/LLaMA-2OptimizationDiagram-5.png b/src/images/blogs/accelerating-llama-2/LLaMA-2OptimizationDiagram-5.png
new file mode 100644
index 0000000000000..ed1a27409cc8c
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/LLaMA-2OptimizationDiagram-5.png differ
diff --git a/src/images/blogs/accelerating-llama-2/LLaMAWindowsExportRotaryEmbeddingSubgraph-6.jpg b/src/images/blogs/accelerating-llama-2/LLaMAWindowsExportRotaryEmbeddingSubgraph-6.jpg
new file mode 100644
index 0000000000000..eae3d79510289
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/LLaMAWindowsExportRotaryEmbeddingSubgraph-6.jpg differ
diff --git a/src/images/blogs/accelerating-llama-2/RotaryEmbeddingFunctionExample-7.png b/src/images/blogs/accelerating-llama-2/RotaryEmbeddingFunctionExample-7.png
new file mode 100644
index 0000000000000..033c4c08c54f6
Binary files /dev/null and b/src/images/blogs/accelerating-llama-2/RotaryEmbeddingFunctionExample-7.png differ
diff --git a/src/images/blogs/hugging-face-blog-img.png b/src/images/blogs/hugging-face-blog-img.png
new file mode 100644
index 0000000000000..2ffe02b6956e8
Binary files /dev/null and b/src/images/blogs/hugging-face-blog-img.png differ
diff --git a/src/images/blogs/pytorch-on-the-edge-puppies.png b/src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-puppies.png
similarity index 100%
rename from src/images/blogs/pytorch-on-the-edge-puppies.png
rename to src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-puppies.png
diff --git a/src/images/blogs/pytorch-on-the-edge-speechrec.png b/src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-speechrec.png
similarity index 100%
rename from src/images/blogs/pytorch-on-the-edge-speechrec.png
rename to src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-speechrec.png
diff --git a/src/images/blogs/pytorch-on-the-edge-textgen.png b/src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-textgen.png
similarity index 100%
rename from src/images/blogs/pytorch-on-the-edge-textgen.png
rename to src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-textgen.png
diff --git a/src/images/blogs/pytorch-on-the-edge-training.png b/src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-training.png
similarity index 100%
rename from src/images/blogs/pytorch-on-the-edge-training.png
rename to src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-training.png
diff --git a/src/images/blogs/pytorch-on-the-edge-with-ort.png b/src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-with-ort.png
similarity index 100%
rename from src/images/blogs/pytorch-on-the-edge-with-ort.png
rename to src/images/blogs/pytorch-on-the-edge/pytorch-on-the-edge-with-ort.png
diff --git a/src/routes/blogs/+page.svelte b/src/routes/blogs/+page.svelte
index fa2d0c6eb61f6..0aeb85a57e6a8 100644
--- a/src/routes/blogs/+page.svelte
+++ b/src/routes/blogs/+page.svelte
@@ -6,6 +6,8 @@
import anime from 'animejs';
import { onMount } from 'svelte';
import ImageBlogs from '../../images/undraw/image_blogs.svelte';
+ import HFImage from '../../images/blogs/hugging-face-blog-img.png';
+ import LlamaImage from '../../images/blogs/accelerating-llama-2/Figure1-LLaMA-2-7B-E2E-Throughput.png';
onMount(() => {
anime({
targets: '.border-primary',
@@ -21,21 +23,43 @@
});
});
let featuredblog = [
+ {
+ title: 'Accelerating LLaMA-2 Inference with ONNX Runtime',
+ date: 'November 14th, 2023',
+ blurb: 'Learn how ONNX Runtime can speed up LLaMA-2 inference by up to 4.5X',
+ link: 'blogs/accelerating-llama-2',
+ image: LlamaImage,
+ imgalt: 'LLaMA-2 e2e throughput'
+ },
{
title: 'Run PyTorch models on the edge',
date: 'October 12th, 2023',
blurb:
'Everything you need to know about running PyTorch models on the edge with ONNX Runtime.',
link: 'blogs/pytorch-on-the-edge',
- image: 'https://onnxruntime.ai/_app/immutable/assets/pytorch-on-the-edge-with-ort.cdaa9c84.png'
+ image:
+ 'https://onnxruntime.ai/_app/immutable/assets/pytorch-on-the-edge-with-ort.cdaa9c84.png',
+ imgalt: 'Run PyTorch models on the edge'
},
+ {
+ title: 'Accelerating over 130,000 Hugging Face models with ONNX Runtime',
+ date: 'October 4th, 2023',
+ blurb:
+ 'Learn more on how ONNX Runtime helps users accelerate open source machine learning models from Hugging Face.',
+ link: 'https://cloudblogs.microsoft.com/opensource/2023/10/04/accelerating-over-130000-hugging-face-models-with-onnx-runtime/',
+ image: HFImage,
+ imgalt: 'Hugging Face models with ONNX Runtime'
+ }
+ ];
+ let blogs = [
{
title: 'On-Device Training with ONNX Runtime: A deep dive',
date: 'July 5th, 2023',
blurb:
'This blog presents technical details of On-Device training with ONNX Runtime. It explains how On-Device Training works and what are the different steps and artifacts involved in the training process. This information will help you train your models on edge devices.',
link: 'https://cloudblogs.microsoft.com/opensource/2023/07/05/on-device-training-with-onnx-runtime-a-deep-dive/',
- image: 'https://cloudblogs.microsoft.com/opensource/wp-content/uploads/sites/37/2023/06/Open-Source-1.webp'
+ image:
+ 'https://cloudblogs.microsoft.com/opensource/wp-content/uploads/sites/37/2023/06/Open-Source-1.webp'
},
{
title:
@@ -45,9 +69,7 @@
'Learn how ONNX Runtime accelerates Whisper and makes it easy to deploy on desktop, mobile, in the cloud, and even in the browser.',
link: 'https://medium.com/microsoftazure/build-and-deploy-fast-and-portable-speech-recognition-applications-with-onnx-runtime-and-whisper-5bf0969dd56b',
image: 'https://miro.medium.com/v2/resize:fit:1100/format:webp/1*DJH8_6GS06-N32tkVhdTOw.png'
- }
- ];
- let blogs = [
+ },
{
title: 'On-Device Training: Efficient training on the edge with ONNX Runtime',
date: 'May 31st, 2023',
diff --git a/src/routes/blogs/accelerating-llama-2/+page.svelte b/src/routes/blogs/accelerating-llama-2/+page.svelte
new file mode 100644
index 0000000000000..0f5add02a8d4b
--- /dev/null
+++ b/src/routes/blogs/accelerating-llama-2/+page.svelte
@@ -0,0 +1,264 @@
+
+
+
By: Kunal Vaishnavi and Parinita Rahi
+14TH NOVEMBER, 2023
+ Interested in running Llama2 faster? Let us explore how ONNX Runtime can propel your Llama2
+ variants for faster inference!
+ You can now experience significant inference gains, up to 4X faster, for the 7B, 13B, and 70B
+ models, thanks to state-of-the-art fusion and kernel optimizations with ONNX Runtime. This
+ blog details the performance enhancements, dives into ONNX Runtime fusion optimizations and
+ multi-GPU inferencing support, and shows how to leverage the cross-platform capabilities of
+ ONNX Runtime for seamless inferencing across platforms. It is the first in a series of upcoming
+ blogs that will cover additional topics, such as efficient memory usage with ONNX Runtime
+ quantization updates and cross-platform usage scenarios.
+ Llama2 is a state-of-the-art open source LLM from Meta ranging in scale from 7B to 70B
+ parameters (7B, 13B, 70B). Microsoft and Meta announced their AI on Azure and Windows
+ collaboration in July 2023. As part of the announcement, Llama2 was added to the Azure AI
+ model catalog, which serves as a hub of foundation models that empower developers and machine
+ learning (ML) professionals to easily discover, evaluate, customize, and deploy pre-built
+ large AI models at scale.
+ ONNX Runtime allows you to easily integrate the power of this generative AI model into your
+ apps and services with improved optimizations that yield faster inferencing speeds and lower
+ your costs.
+ As part of the new 1.16.2 release, ONNX Runtime now has several built-in optimizations for
+ Llama2, including graph fusions and kernel optimizations. The inference speedups below are
+ measured against the Hugging Face (HF) variants of Llama2 in PyTorch compile mode for prompt
+ latency with CUDA FP16. We see ~3X gains in end-to-end throughput for both the 7B and 13B
+ models. End-to-end (wall-clock) throughput is defined as
+ batch size * (prompt length + token generation length) / wall-clock latency, where
+ wall-clock latency is the latency of running the model end to end and the token generation
+ length is 256 generated tokens. The E2E throughput is up to 4.5X higher than with PyTorch
+ compile.
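To make the end-to-end metric concrete, here is a minimal sketch of the wall-clock throughput formula above; the batch size, prompt length, and latency values are illustrative only.

```python
# Wall-clock (end-to-end) throughput as defined above:
#   batch_size * (prompt_length + generation_length) / wall_clock_latency
# The numbers below are hypothetical, for illustration only.

def e2e_throughput(batch_size: int, prompt_length: int,
                   generation_length: int, wall_clock_latency_s: float) -> float:
    """Tokens processed per second across the whole request."""
    return batch_size * (prompt_length + generation_length) / wall_clock_latency_s

# Example: batch of 4, 512-token prompts, 256 generated tokens, 6.1 s end to end.
print(f"{e2e_throughput(4, 512, 256, 6.1):.1f} tokens/s")
```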
+ The graphs below show latency comparisons between the ONNX Runtime and PyTorch variants of the
+ Llama2 7B model on CUDA FP16. Latency here is defined as the time it takes to complete one pass
+ through the model to produce the logits and synchronize the outputs.
+ Token generation throughput below is the average throughput of the first 128 tokens generated.
+ We see up to 3.5X gains in token generation throughput when compared to PyTorch eager and
+ compile modes.
+ More details on these metrics can be found here.
+ ONNX Runtime supports multi-GPU inference to enable serving large models. Even in FP16
+ precision, the LLaMA-2 70B model requires about 140GB of memory (70B parameters × 2 bytes),
+ more than a single NVIDIA A100 80GB GPU can hold, so loading the model for inference requires
+ multiple GPUs.
+ ONNX Runtime applied Megatron-LM Tensor Parallelism on the 70B model to split the
+ original model weights onto different GPUs. Megatron sharding on the 70B model
+ shards the PyTorch model with FP16 precision into 4 partitions, converts each partition into
+ ONNX format, and then applies a new ONNX Runtime graph fusion on the converted ONNX model. The
+ 70B model has ~30 tokens per second throughput for token generation at batch size 1, and
+ end-to-end throughput starts at 30 ms for smaller sequence lengths with these optimizations.
+ You can find additional example scripts here.
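As a rough illustration of the serving setup (not the example scripts themselves), the sketch below loads one of the four partitions per GPU with ONNX Runtime. The shard file names and the use of MPI ranks are assumptions made for this sketch.

```python
# Sketch: load one shard of a tensor-parallel ONNX model per GPU.
# Assumes one process per GPU (e.g. launched with mpirun) and shard files
# named rank_0.onnx through rank_3.onnx; both are assumptions for illustration.
from mpi4py import MPI
import onnxruntime as ort

rank = MPI.COMM_WORLD.Get_rank()

session = ort.InferenceSession(
    f"llama2-70b/rank_{rank}.onnx",
    providers=[("CUDAExecutionProvider", {"device_id": rank})],
)
# Each rank runs its partition; collective ops inside the graph (or in the
# surrounding script) combine the partial results across GPUs.
```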
+ The techniques that ONNX Runtime uses for optimizations, such as graph fusions, are applicable
+ to state-of-the-art models. As these models become more complex, the techniques used to apply
+ the graph fusions are adapted to accommodate the extra complexity. For example, instead of
+ manually matching fusion patterns in the graph, ONNX Runtime now supports automated pattern
+ matching. Rather than detect large subgraphs by hand and match the many paths they form,
+ fusion opportunities can instead be identified by exporting a large module as a function and
+ then pattern matching against the function's spec.
+ As a concrete example, Figure 6 shows the nodes that comprise rotary embedding
+ computations. Pattern matching against this subgraph is cumbersome because of the number of
+ paths to verify. By exporting this as a function, the parent view of the graph will only show
+ the inputs and outputs and represent all these nodes as a single operator.
+ This approach makes it easier to maintain and support future versions of the rotary embedding
+ computations because the pattern matching is only dependent on the operator's inputs and
+ outputs instead of its internal semantic representation. It also allows other existing
+ implementations of rotary embeddings in similar models such as GPT-NeoX, Falcon, Mistral,
+ Zephyr, etc. to be pattern matched and fused with minimal or no changes.
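A minimal sketch of the export step this relies on is shown below, using a hypothetical, simplified RotaryEmbedding module; the export_modules_as_functions argument of torch.onnx.export keeps the module as a single ONNX function node in the parent graph.

```python
# Sketch: export a submodule as an ONNX local function so a fusion pass can
# match the function's inputs/outputs instead of its internal subgraph.
# RotaryEmbedding here is a simplified stand-in, not the real implementation.
import torch

class RotaryEmbedding(torch.nn.Module):
    def forward(self, x, position_ids):
        angles = position_ids.float().unsqueeze(-1)            # [batch, seq, 1]
        return x * torch.cos(angles) + x * torch.sin(angles)   # simplified math

class TinyAttention(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.rotary = RotaryEmbedding()

    def forward(self, x, position_ids):
        return self.rotary(x, position_ids)

model = TinyAttention()
x = torch.randn(1, 8, 64)
position_ids = torch.arange(8).unsqueeze(0)

torch.onnx.export(
    model,
    (x, position_ids),
    "attention_with_rotary.onnx",
    opset_version=17,
    # Keep RotaryEmbedding as a single function node in the parent graph.
    export_modules_as_functions={RotaryEmbedding},
)
```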
+ ONNX Runtime also adds support for the GroupQueryAttention (GQA) operator, which leverages the
+ new Flash Attention V2 algorithm and its optimized kernels to efficiently compute attention.
+ The GQA operator supports past-present buffer sharing between the past key/value cache (past
+ KV cache) and the present key/value cache (present KV cache). By binding the present KV caches
+ to the past KV caches, there is no need to allocate separate on-device memory for both caches.
+ Instead, the past KV caches can be pre-allocated with enough on-device memory so that no new
+ on-device memory needs to be requested during inference. This reduces memory usage when the KV
+ caches become large during compute-intensive workloads and lowers latency by eliminating
+ on-device memory allocation requests. The past-present buffer sharing can be enabled or
+ disabled without needing to change the ONNX model, allowing greater flexibility for end users
+ to decide which approach is best for them.
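The sketch below illustrates the buffer-sharing idea with ONNX Runtime's I/O binding API: one KV cache buffer is allocated on the device up front and bound as both the past and present cache. The model file, tensor names, and shapes are hypothetical.

```python
# Sketch: pre-allocate one on-device KV cache buffer and bind it as both the
# past and present key cache, so generation never re-allocates device memory.
# Model path, tensor names, and shapes are hypothetical.
import numpy as np
import onnxruntime as ort

batch, num_heads, max_seq_len, head_size = 1, 32, 2048, 128

session = ort.InferenceSession("llama2-7b-gqa.onnx",
                               providers=["CUDAExecutionProvider"])

# One buffer, allocated once on the GPU, big enough for the longest sequence.
key_cache = ort.OrtValue.ortvalue_from_numpy(
    np.zeros((batch, num_heads, max_seq_len, head_size), dtype=np.float16),
    "cuda", 0)

binding = session.io_binding()
# Past and present point at the same device memory: buffer sharing.
binding.bind_ortvalue_input("past_key_values.0.key", key_cache)
binding.bind_ortvalue_output("present.0.key", key_cache)
# ... bind the remaining inputs/outputs, then:
# session.run_with_iobinding(binding)
```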
+ In addition to these fusions and kernel optimizations, ONNX Runtime reduces the model's memory
+ usage. Besides quantization improvements (which will be covered in a future post), ONNX
+ Runtime compresses the size of the cosine and sine caches used in each of the rotary
+ embeddings by 50%. The compute kernels in ONNX Runtime that run the rotary embedding
+ computations can then recognize this format and use their parallelized implementations to
+ calculate the rotary embeddings more efficiently with less memory usage. The rotary embedding
+ compute kernels also support interleaved and non-interleaved formats, covering both the
+ Microsoft version of LLaMA-2 and the Hugging Face version of LLaMA-2 while sharing the same
+ calculations.
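For intuition, here is a small NumPy illustration of the two layouts: the interleaved format rotates adjacent element pairs, while the non-interleaved (Hugging Face style) format rotates the two halves of the head dimension. This is illustrative only, not the ONNX Runtime kernel code.

```python
# Two rotary-embedding layouts: interleaved pairs adjacent elements,
# non-interleaved pairs the first and second halves of the head dimension.
import numpy as np

def rotate_interleaved(x, cos, sin):
    x1, x2 = x[..., 0::2], x[..., 1::2]          # (x0, x1), (x2, x3), ...
    out = np.empty_like(x)
    out[..., 0::2] = x1 * cos - x2 * sin
    out[..., 1::2] = x1 * sin + x2 * cos
    return out

def rotate_half(x, cos, sin):                    # Hugging Face style
    half = x.shape[-1] // 2
    x1, x2 = x[..., :half], x[..., half:]
    return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos], axis=-1)

head_size, position = 8, 3
inv_freq = 1.0 / (10000 ** (np.arange(0, head_size, 2) / head_size))
angles = position * inv_freq                     # only head_size/2 angles needed
x = np.random.randn(head_size).astype(np.float32)
print(rotate_interleaved(x, np.cos(angles), np.sin(angles)))
print(rotate_half(x, np.cos(angles), np.sin(angles)))
```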
+ The optimizations work for the Hugging Face versions (models ending with -hf) and the
+ Microsoft versions. You can download the optimized HF versions from Microsoft's LLaMA-2 ONNX
+ repository. Stay tuned for newer Microsoft versions coming soon!
+ Olive is a hardware-aware model optimization tool that incorporates advanced techniques such
+ as model compression, optimization, and compilation. We have made ONNX Runtime optimizations
+ available through Olive so you can streamline the entire optimization process for a given
+ hardware target with a simple experience.
+ Here is an example of Llama2 optimization with Olive, which harnesses the ONNX Runtime
+ optimizations highlighted in this blog. Distinct optimization flows cater to various
+ requirements. For instance, you have the flexibility to choose different data types for
+ quantization in CPU and GPU inference, based on your accuracy tolerance. Additionally, you can
+ fine-tune your own Llama2 model with Olive-QLoRA on client GPUs and perform inference with
+ ONNX Runtime optimizations.
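As a rough sketch (the linked Olive example is the authoritative reference), an optimization workflow can be driven from a JSON config; the config file name below is a placeholder for the configs that ship with that example.

```python
# Sketch: run an Olive optimization workflow from a JSON config, equivalent to
#   python -m olive.workflows.run --config llama2_gpu_config.json
# "llama2_gpu_config.json" is a placeholder file name.
from olive.workflows import run as olive_run

olive_run("llama2_gpu_config.json")
```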
+ Here is a sample notebook that shows you an end-to-end example of how you can use the above
+ ONNX Runtime optimizations in your application.
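The notebook is the authoritative reference; as a rough sketch of the same flow, the snippet below uses Hugging Face Optimum's ONNX Runtime integration. The model ID and generation settings are placeholders.

```python
# Rough sketch of end-to-end generation with ONNX Runtime via Hugging Face
# Optimum. The model ID is a placeholder and requires access approval.
from optimum.onnxruntime import ORTModelForCausalLM
from transformers import AutoTokenizer

model_id = "meta-llama/Llama-2-7b-hf"  # placeholder
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = ORTModelForCausalLM.from_pretrained(model_id, export=True,
                                            provider="CUDAExecutionProvider")

inputs = tokenizer("ONNX Runtime is", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```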
+ The advancements discussed in this blog provide faster Llama2 inferencing with ONNX Runtime,
+ offering exciting possibilities for AI applications and research. With improved performance
+ and efficiency, the horizon is wide open for innovation, and we eagerly await new applications
+ built with Llama2 and ONNX Runtime by its vibrant community of developers. Stay tuned for more
+ updates!
+{description}
- +12TH OCTOBER, 2023
-Most modern ML models are developed with PyTorch. The agility and flexibility that PyTorch