-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconvolutions_part2.html
182 lines (182 loc) · 12.2 KB
/
convolutions_part2.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
<!doctype html>
<html lang="en" class="no-js">
<head>
<meta charset="utf-8">
<!-- begin SEO -->
<title>Accelerating Image Filters - Part 2</title>
<meta name="description" content="">
<meta property="og:locale" content="en">
<meta property="og:site_name" content="Home">
<meta property="og:title" content="Accelerating Image Filters - Part 2">
<link rel="canonical" href="alankelly.github.io/convolutions_part2.html/">
<meta property="og:url" content="alankelly.github.io/convolutions_part2.html/">
<meta property="og:description" content="">
<meta property="og:image" content="assets/images/first_photo.jpg">
<meta property="og:type" content="article">
<meta property="article:published_time" content="2019-04-23T00:00:00+00:00">
<script type="application/ld+json">
{
"@context" : "http://schema.org",
"@type" : "Person",
"name" : "Alan Kelly",
"url" : "//alankelly.github.io"
}
</script>
<!-- end SEO -->
<!-- http://t.co/dKP3o1e -->
<meta name="HandheldFriendly" content="True">
<meta name="MobileOptimized" content="320">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<script>
(function (rootElement) {
  // Strip every whole-word "no-js" token from the root element's class list
  // and append " js " so stylesheets can tell that scripting is enabled.
  var withoutNoJs = rootElement.className.replace(/\bno-js\b/g, '');
  rootElement.className = withoutNoJs + ' js ';
})(document.documentElement);
</script>
<!-- For all browsers -->
<link rel="stylesheet" href="assets/css/main.css">
<meta http-equiv="cleartype" content="on">
<!-- start custom head snippets -->
<!-- insert favicons. use http://realfavicongenerator.net/ -->
<!-- end custom head snippets -->
<!-- Begin Jekyll SEO tag v2.5.0 -->
<title>Accelerating Image Filters - Part 2 | Home</title>
<meta name="generator" content="Jekyll v3.8.5" />
<meta property="og:title" content="Accelerating Image Filters - Part 2" />
<meta name="author" content="Alan Kelly" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="Personal Website and Blog." />
<meta property="og:description" content="Personal Website and Blog." />
<link rel="canonical" href="//alankelly.github.io/convolutions_part2.html" />
<meta property="og:url" content="//alankelly.github.io/convolutions_part2.html/" />
<meta property="og:site_name" content="Home" />
<meta property="og:type" content="article" />
<meta property="article:published_time" content="2019-04-23T00:00:00+00:00" />
<meta name="twitter:card" content="summary" />
<script type="application/ld+json">
{"author":{"@type":"Person","name":"Alan Kelly"},"@type":"BlogPosting","mainEntityOfPage":{"@type":"WebPage","@id":"convolutions_part2.html"},"url":"convolutions_part2.html/","headline":"Accelerating Image Filters - Part 2","dateModified":"2019-04-23T00:00:00+00:00","datePublished":"2019-04-23T00:00:00+00:00","description":"Personal Website and Blog.","@context":"http://schema.org"}
</script>
<!-- End Jekyll SEO tag -->
<link rel="icon" href="assets/images/icon.png">
<style>
body {
margin-bottom: 100px;
}
</style>
</head>
<body class="layout--single">
<!--[if lt IE 9]>
<div class="notice--danger align-center" style="margin: 0;">You are using an <strong>outdated</strong> browser. Please <a href="http://browsehappy.com/">upgrade your browser</a> to improve your experience.</div>
<![endif]-->
<div class="masthead">
<div class="masthead__inner-wrap">
<div class="masthead__menu">
<nav id="site-nav" class="greedy-nav">
<ul class="visible-links">
<li class="masthead__menu-item masthead__menu-item--lg"><a href="//alankelly.github.io/">Home</a></li>
<li class="masthead__menu-item"><a href="//alankelly.github.io/blog">Blog</a></li>
<li class="masthead__menu-item"><a href="//alankelly.github.io/talks">Talks</a></li>
<li class="masthead__menu-item"><a href="//alankelly.github.io/cv/AlanKelly.pdf">CV</a></li>
</ul>
<ul class="hidden-links hidden"></ul>
</nav>
</div>
</div>
</div>
<div class="page__hero--overlay"
style="background-color: #000; background-image: linear-gradient(rgba(0, 0, 0, 0.5), rgba(0, 0, 0, 0.5)), url('assets/images/first_photo.jpg');"
>
<div class="wrapper">
<h1 class="page__title" itemprop="headline">
Accelerating Image Filters - Part 2
</h1>
<p class="page__lead"><br /><br /><br /></p>
</div>
</div>
<div id="main" role="main">
<div class="sidebar sticky">
<div itemscope itemtype="http://schema.org/Person">
<div class="author__avatar">
<img src="assets/images/me.png" class="author__avatar" alt="Alan Kelly" itemprop="image">
</div>
<div class="author__content">
<h3 class="author__name" itemprop="name">Alan Kelly</h3>
<p class="author__bio" itemprop="description">
Software Performance Expert
</p>
</div>
<div class="author__urls-wrapper">
<button class="btn btn--inverse">Follow</button>
<ul class="author__urls social-icons">
<li><a href="https://www.linkedin.com/in/alanjkelly" itemprop="sameAs" target="_blank">
<i class="fa fa-fw fa-linkedin-square" aria-hidden="true"></i>
LinkedIn
</a>
</li>
<li><a href="https://github.com/alankelly" itemprop="sameAs" target="_blank">
<i class="fa fa-fw fa-github" aria-hidden="true"></i>
Github
</a>
</li>
</ul>
</div>
</div>
</div>
<article class="page" itemscope itemtype="http://schema.org/CreativeWork">
<meta itemprop="headline" content="Accelerating Image Filters - Part 2">
<meta itemprop="description" content="">
<meta itemprop="datePublished" content="April 23, 2019">
<div class="page__inner-wrap">
<section class="page__content" itemprop="text">
<p>In this post I will examine the half precision floating point version of NEBox3x3. You can read all about half precision floats <a href="https://en.wikipedia.org/wiki/Half-precision_floating-point_format" target="_blank">here</a>. The latest Arm processors (Armv8.2-A) optionally support 16-bit floating point operations.</p>
<p>In my previous <a href="https://www.alankelly.github.io/convolutions_part1" target="_blank">post</a> I wrote about replacing the 32 bit floating point operations with fixed point operations in the version of this function for processors which don't have the required instructions for half precision float.</p>
<p>As we saw in the previous post, converting from integer to float and back again is very expensive. The original code is shown here:</p>
<script src="https://gist.github.com/alankelly/8e95fc6f75c62d1149f24a43e24408c6.js"></script>
<p>We see here each register is converted immediately to float16 when it's loaded. All of the calculation is done using float16. We saw in my previous post that these conversions are expensive, can we get rid of them somehow? Yes we can! We load the data without converting it to float16, we only do the conversion to uint16. We only convert at the end when we need to scale the result by dividing by nine. We modify the loads:</p>
<script src="https://gist.github.com/alankelly/730c17ccb5c967360a1ae3c54338d90e.js"></script>
<p>And we calculate the result using uint16. We then convert the register to float16. Unlike the previous example, we don't need to split the register in two and convert each half to float - the converted values in this case fit into one register:</p>
<script src="https://gist.github.com/alankelly/9c9483bde288d90547e450681eabe630.js"></script>
<p>So is this quicker than the original code? And is it quicker than the fixed point integer version from the previous post? It's time to test!</p>
<p>Something's wrong, the original float16 code is twice as slow as the original integer code. To find out what's going on, we're going to examine the generated assembly. I extracted the relevant part of the generated code, where the register is converted from uint16 to float16.</p>
<script src="https://gist.github.com/alankelly/ab97203104752085a5c2a6dc1c46ea71.js"></script>
<p>There are six instructions generated here. It looks like the lower and upper halves of the register are being converted to int32, then converted to float32 and then converted to float16. This seems a rather roundabout way of doing things! I know there's an instruction to convert uint16 directly to float16. This looks like a compiler bug. To test this, I'll use godbolt Compiler Explorer which is an interactive online compiler which shows the assembly output of compiled C++ using various compilers. I'll try the latest gcc - version 8.2. Let's see what this generates:</p>
<script src="https://gist.github.com/alankelly/efacc2c075d55494872cadf1aff10a07.js"></script>
<p>One instruction! This is definitely a compiler bug we've stumbled upon. Time to open a <a href="https://github.com/android-ndk/ndk/issues/957" target="_blank">bug report</a>.</p>
<p>Since we have a compiler bug, we can't get accurate runtimes, however, even the figures above show that this is a very worthwhile optimisation. With the compiler bug, it's impossible to tell whether the integer fixed point version or the float16 version is best. But the good news is that both versions have been optimised and the compiler bug has been reported.</p>
</section>
<footer class="page__meta">
<p class="page__date"><strong><i class="fa fa-fw fa-calendar" aria-hidden="true"></i> Updated:</strong> <time datetime="2019-04-23T00:00:00+00:00">April 23, 2019</time></p>
</footer>
<section class="page__share">
<h4 class="page__share-title">Share on</h4>
<!-- <a href="https://twitter.com/intent/tweet?via=alanjkelly&text=Accelerating Image Filters - Part 2 //alankelly.github.io/convolutions_part2.html" class="btn btn--twitter" title="Share on Twitter"><i class="fa fa-fw fa-twitter" aria-hidden="true"></i><span> Twitter</span></a> -->
<a href="https://www.linkedin.com/shareArticle?mini=true&url=https://alankelly.github.io/convolutions_part2.html" target="_blank" class="btn btn--linkedin" title="Share on LinkedIn"><i class="fa fa-fw fa-linkedin" aria-hidden="true"></i><span> LinkedIn</span></a>
</section>
</div>
</article>
</div>
<div class="page__footer">
<footer>
<!-- start custom footer snippets -->
<!-- end custom footer snippets -->
<div class="page__footer-copyright">
© 2019 Alan Kelly.
<a href="https://www.linkedin.com/in/alanjkelly" itemprop="sameAs" target="_blank"><i class="fa fa-fw fa-linkedin-square" aria-hidden="true"></i>LinkedIn</a>
<a href="https://github.com/alankelly" itemprop="sameAs" target="_blank"><i class="fa fa-fw fa-github" aria-hidden="true"></i>Github</a>
</div>
</footer>
</div>
<script type="text/javascript" src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.1/MathJax.js?config=TeX-AMS-MML_HTMLorMML"></script>
<script type="text/javascript" src="assets/js/main.min.js"></script>
<!-- Global Site Tag (gtag.js) - Google Analytics -->
<!-- Google Analytics -->
<script>
// Google Analytics loader. Stores the name r ('ga') in window.GoogleAnalyticsObject
// and, unless window[r] already exists, defines it as a function that pushes its
// arguments onto the queue i[r].q. It records a timestamp in i[r].l, then creates
// a <script> element with async set and src = g, and inserts it before the first
// <script> element found in the document.
(function(i,s,o,g,r,a,m){i['GoogleAnalyticsObject']=r;i[r]=i[r]||function(){
(i[r].q=i[r].q||[]).push(arguments)},i[r].l=1*new Date();a=s.createElement(o),
m=s.getElementsByTagName(o)[0];a.async=1;a.src=g;m.parentNode.insertBefore(a,m)
})(window,document,'script','https://www.google-analytics.com/analytics.js','ga');
// Issue a 'create' command with the property ID and 'auto', followed by a 'send'
// command for a 'pageview', through the ga function set up above.
ga('create', 'UA-139124337-1', 'auto');
ga('send', 'pageview');
</script>
<!-- End Google Analytics -->
</body>
</html>