tiny_dnn 1.0.0
A header only, dependency-free deep learning framework in C++11
Loading...
Searching...
No Matches
tiny_quantization_kernel.h
1/*
2 Copyright (c) 2016, Taiga Nomi, Edgar Riba
3 All rights reserved.
4
5 Redistribution and use in source and binary forms, with or without
6 modification, are permitted provided that the following conditions are met:
7 * Redistributions of source code must retain the above copyright
8 notice, this list of conditions and the following disclaimer.
9 * Redistributions in binary form must reproduce the above copyright
10 notice, this list of conditions and the following disclaimer in the
11 documentation and/or other materials provided with the distribution.
12 * Neither the name of the <organization> nor the
13 names of its contributors may be used to endorse or promote products
14 derived from this software without specific prior written permission.
15
16 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
17 EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
18 WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
20 DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
21 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
22 LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
23 ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
24 (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
25 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
26*/
27#pragma once
28
29namespace tiny_dnn {
30namespace core {
31namespace kernels {
32
// Returns the largest finite value representable by T, as reported by
// std::numeric_limits<T>.
template <class T>
T highest() {
    typedef std::numeric_limits<T> limits;
    // The call is parenthesised so that a function-like `max` macro, if one
    // happens to be defined, is not expanded here.
    const T upper_bound = (limits::max)();
    return upper_bound;
}
37
// Returns the most negative value representable by T.
//
// For integer types this is numeric_limits<T>::min(). For non-integer types
// min() is not used; the negated max() is returned instead.
template <class T>
T lowest() {
    typedef std::numeric_limits<T> limits;
    if (limits::is_integer) {
        return (limits::min)();
    }
    return -(limits::max)();
}
42
43// We have to be able to detect and handle overflows in int32, so this function
44// uses doubles and int64's to make sure we have enough room.
45template <class T>
46int64_t float_to_quantized_unclamped(float_t input, float_t range_min, float_t range_max) {
47 if (range_min == range_max) {
48 return 0;
49 }
50 const int number_of_bits = sizeof(T) * 8;
51 const int64_t number_of_steps = static_cast<int64_t>(1) << number_of_bits;
52 const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
53 const double range = ((range_max - range_min) * range_adjust);
54 const double range_scale = (number_of_steps / range);
55 int64_t quantized =
56 static_cast<int64_t>(round(input * range_scale) - round(range_min * range_scale));
57 const int64_t lowest_quantized =
58 static_cast<int64_t>(lowest<T>());
59 quantized += lowest_quantized;
60 return quantized;
61}
62
// Narrows a 64-bit integer to 32 bits. In builds where assert() is active the
// value is checked to lie within the int32_t range first.
inline int32_t int64_to_int32(int64_t src) {
    typedef std::numeric_limits<int32_t> limits32;
    assert(src >= (limits32::min)() && src <= (limits32::max)());
    return static_cast<int32_t>(src);
}
67
68// This converts the float into the final quantized type, clamping/saturating
69// any over or underflows.
70template <class T>
71T float_to_quantized(float_t input, float_t range_min, float_t range_max) {
72 int64_t quantized = float_to_quantized_unclamped<T>(input, range_min, range_max);
73 const int64_t lowest_quantized =
74 static_cast<int64_t>(lowest<T>());
75 const int64_t highest_quantized =
76 static_cast<int64_t>(highest<T>());
77 quantized = std::max<int64_t>(quantized, lowest_quantized);
78 quantized = std::min<int64_t>(quantized, highest_quantized);
79 return static_cast<T>(static_cast<int32_t>(quantized));
80}
81
82template <class T>
83float quantized_to_float(T input, float_t range_min, float_t range_max) {
84 if (range_min == range_max) {
85 return range_min;
86 }
87 const int number_of_bits = sizeof(T) * 8;
88 const int64_t number_of_steps = static_cast<int64_t>(1) << number_of_bits;
89 const double range_adjust = (number_of_steps / (number_of_steps - 1.0));
90 const double range = ((range_max - range_min) * range_adjust);
91 const double range_scale = (range / number_of_steps);
92 const int64_t lowest_quantized =
93 static_cast<int64_t>(lowest<T>());
94 const double offset_input = static_cast<double>(input) - lowest_quantized;
95 const double result = range_min + (offset_input * range_scale);
96 return static_cast<float_t>(result);
97}
98
99template <class T>
100float float_for_one_quantized_level(float_t range_min, float_t range_max) {
101 const int64_t highest_ = static_cast<int64_t>(highest<T>());
102 const int64_t lowest_ = static_cast<int64_t>(lowest<T>());
103 const float float_for_one_quantized_level =
104 (range_max - range_min) / (highest_ - lowest_);
105 return float_for_one_quantized_level;
106}
107
108template <class T1, class T2, class T3>
109void quantization_range_for_multiplication(float_t min_a, float_t max_a, float_t min_b,
110 float_t max_b, float_t* min_c,
111 float_t* max_c) {
112 const float_t a_float_for_one_quant_level =
113 float_for_one_quantized_level<T1>(min_a, max_a);
114 const float_t b_float_for_one_quant_level =
115 float_for_one_quantized_level<T2>(min_b, max_b);
116
117 const int64_t c_highest = static_cast<int64_t>(highest<T3>());
118 const int64_t c_lowest = static_cast<int64_t>(lowest<T3>());
119 const float c_float_for_one_quant_level =
120 a_float_for_one_quant_level * b_float_for_one_quant_level;
121
122 *min_c = c_float_for_one_quant_level * c_lowest;
123 *max_c = c_float_for_one_quant_level * c_highest;
124}
125
126template <class T1, class T2>
127inline T2 requantize_in_new_range(T1 input, float_t min_input, float_t max_input,
128 float_t min_new, float_t max_new) {
129 const float_t input_float = quantized_to_float<T1>(input, min_input, max_input);
130 return float_to_quantized<T2>(input_float, min_new, max_new);
131}
132
133template <class T1, class T2>
134inline void requantize_many_in_new_range(T1* input, size_t count, float_t min_input,
135 float_t max_input, float_t min_output,
136 float_t max_output, T2* output) {
137 for (size_t index = 0; index < count; ++index) {
138 const float_t input_float =
139 quantized_to_float<T1>(input[index], min_input, max_input);
140 output[index] = float_to_quantized<T2>(input_float, min_output, max_output);
141 }
142}
143
144// Because converting 32-bit accumulated results down to eight bit is a common
145// case, we have a specialized code path to handle it as efficiently as
146// possible using only fixed-point math for the inner loop.
147template <>
148inline void requantize_many_in_new_range<int32_t, uint8_t>(
149 int32_t* input, size_t count, float_t min_input, float_t max_input,
150 float_t min_output, float_t max_output, uint8_t* output) {
151 // Initially we calculate all the constants we need once, before we go into
152 // the inner loop.
153 const int fp_shift = 16;
154 const float input_range = max_input - min_input;
155 const float output_range = max_output - min_output;
156 const float recip_output_range = (255.0f / output_range);
157 const int64_t recip_output_range_fp =
158 static_cast<int64_t>(recip_output_range * (1 << fp_shift));
159 const int64_t range_scale_fp =
160 static_cast<int64_t>(255.0f * (1 << fp_shift) * input_range / output_range);
161 const int64_t input_offset_fp =
162 static_cast<int64_t>((min_input * recip_output_range_fp) + (range_scale_fp >> 1));
163 const int64_t output_offset_fp = static_cast<int64_t>(round((min_output * 255.0f) / output_range));
164 const int64_t rounding_delta = 1 << (fp_shift - 1);
165 // Inside this loop we just do minimal adds, multiplies, and shifts, in a way
166 // that could be easily adapted for a SIMD implementation. It should also be
167 // possible to perform all the calculations in 32-bit rather than 64, but
168 // that's not been implemented yet.
169 for (size_t index = 0; index < count; ++index) {
170 const int64_t input_value = static_cast<int64_t>(input[index]);
171 const int64_t fp_value =
172 ((input_value * range_scale_fp) >> 32) + input_offset_fp;
173 const int64_t round_intermediate =
174 ((fp_value >= 0) ? (fp_value + rounding_delta)
175 : (fp_value - rounding_delta)) >>
176 fp_shift;
177 int64_t quantized_int64 = (round_intermediate - output_offset_fp);
178 quantized_int64 = std::max<int64_t>(quantized_int64, 0LL);
179 quantized_int64 = std::min<int64_t>(quantized_int64, 255LL);
180 output[index] = static_cast<uint8_t>(static_cast<int32_t>(quantized_int64));
181 }
182}
183
184// REQUIRES: 'result->NumElements() == input.NumElements()'
185template <class T>
186void float_tensor_to_quantized_in_place(const vec_t& input, float_t min, float_t max,
187 std::vector<T>* result) {
188 const size_t data_size = input.size();
189 for (size_t i = 0; i < data_size; ++i) {
190 (*result)[i] = float_to_quantized<T>(input[i], min, max);
191 }
192}
193
194template <class T>
195std::vector<T> float_tensor_to_quantized(const vec_t& input, float_t min, float_t max) {
196 std::vector<T> result(input.size(), static_cast<T>(0));
197 float_tensor_to_quantized_in_place<T>(input, min, max, &result);
198 return result;
199}
200
201// REQUIRES: 'result->NumElements() == input.NumElements()'
202template <class T>
203void quantized_tensor_to_float_in_place(const std::vector<T>& input, float_t min, float_t max,
204 vec_t* result) {
205 const size_t data_size = input.size();
206 for (size_t i = 0; i < data_size; ++i) {
207 (*result)[i] = quantized_to_float<T>(input[i], min, max);
208 }
209}
210
211template <class T>
212vec_t quantized_tensor_to_float(const std::vector<T>& input, float_t min, float_t max) {
213 vec_t result(input.size(), static_cast<float_t>(0));
214 quantized_tensor_to_float_in_place<T>(input, min, max, &result);
215 return result;
216}
217
218template <class T1, class T2>
219void quantize_down_and_shrink_range( std::vector<T1>& input, float_t min_input, float_t max_input,
220 float_t* min_new, float_t* max_new, std::vector<T2>* output){
221 const int32_t input_lowest_quantized = static_cast<int32_t>(lowest<T1>());
222 const int32_t input_highest_quantized = static_cast<int32_t>(highest<T1>());
223 T1 actual_min_quantized = input_highest_quantized;
224 T1 actual_max_quantized = input_lowest_quantized;
225 for (serial_size_t i = 0; i < input.size(); ++i) {
226 const T1 value = input[i];
227 actual_min_quantized = std::min(actual_min_quantized, value);
228 actual_max_quantized = std::max(actual_max_quantized, value);
229 }
230 // We want to make sure that the minimum is no larger than zero, so that the
231 // convolution operation can run efficiently.
232 *min_new = std::min(0.0f, quantized_to_float(actual_min_quantized, min_input,
233 max_input));
234 *max_new = quantized_to_float(actual_max_quantized, min_input, max_input);
235 requantize_many_in_new_range<int32_t, uint8_t>(&input[0], input.size(),
236 min_input, max_input, *min_new,
237 *max_new, &(*output)[0]);
238}
239
240} // namespace kernels
241} // namespace core
242} // namespace tiny_dnn