blob: ed595c3e0faf8b60a0e11f8cef7e1b32f06f45f2
1 | /* |
2 | * This file is part of FFmpeg. |
3 | * |
4 | * FFmpeg is free software; you can redistribute it and/or |
5 | * modify it under the terms of the GNU Lesser General Public |
6 | * License as published by the Free Software Foundation; either |
7 | * version 2.1 of the License, or (at your option) any later version. |
8 | * |
9 | * FFmpeg is distributed in the hope that it will be useful, |
10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
12 | * Lesser General Public License for more details. |
13 | * |
14 | * You should have received a copy of the GNU Lesser General Public |
15 | * License along with FFmpeg; if not, write to the Free Software |
16 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
17 | */ |
18 | |
19 | #include "buffer.h" |
20 | #include "common.h" |
21 | #include "hwcontext.h" |
22 | #include "hwcontext_internal.h" |
23 | #include "hwcontext_cuda_internal.h" |
24 | #include "mem.h" |
25 | #include "pixdesc.h" |
26 | #include "pixfmt.h" |
27 | |
28 | #define CUDA_FRAME_ALIGNMENT 256 |
29 | |
30 | typedef struct CUDAFramesContext { |
31 | int shift_width, shift_height; |
32 | } CUDAFramesContext; |
33 | |
34 | static const enum AVPixelFormat supported_formats[] = { |
35 | AV_PIX_FMT_NV12, |
36 | AV_PIX_FMT_YUV420P, |
37 | AV_PIX_FMT_YUV444P, |
38 | AV_PIX_FMT_P010, |
39 | AV_PIX_FMT_P016, |
40 | }; |
41 | |
42 | static int cuda_frames_get_constraints(AVHWDeviceContext *ctx, |
43 | const void *hwconfig, |
44 | AVHWFramesConstraints *constraints) |
45 | { |
46 | int i; |
47 | |
48 | constraints->valid_sw_formats = av_malloc_array(FF_ARRAY_ELEMS(supported_formats) + 1, |
49 | sizeof(*constraints->valid_sw_formats)); |
50 | if (!constraints->valid_sw_formats) |
51 | return AVERROR(ENOMEM); |
52 | |
53 | for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) |
54 | constraints->valid_sw_formats[i] = supported_formats[i]; |
55 | constraints->valid_sw_formats[FF_ARRAY_ELEMS(supported_formats)] = AV_PIX_FMT_NONE; |
56 | |
57 | constraints->valid_hw_formats = av_malloc_array(2, sizeof(*constraints->valid_hw_formats)); |
58 | if (!constraints->valid_hw_formats) |
59 | return AVERROR(ENOMEM); |
60 | |
61 | constraints->valid_hw_formats[0] = AV_PIX_FMT_CUDA; |
62 | constraints->valid_hw_formats[1] = AV_PIX_FMT_NONE; |
63 | |
64 | return 0; |
65 | } |
66 | |
67 | static void cuda_buffer_free(void *opaque, uint8_t *data) |
68 | { |
69 | AVHWFramesContext *ctx = opaque; |
70 | AVCUDADeviceContext *hwctx = ctx->device_ctx->hwctx; |
71 | CudaFunctions *cu = hwctx->internal->cuda_dl; |
72 | |
73 | CUcontext dummy; |
74 | |
75 | cu->cuCtxPushCurrent(hwctx->cuda_ctx); |
76 | |
77 | cu->cuMemFree((CUdeviceptr)data); |
78 | |
79 | cu->cuCtxPopCurrent(&dummy); |
80 | } |
81 | |
82 | static AVBufferRef *cuda_pool_alloc(void *opaque, int size) |
83 | { |
84 | AVHWFramesContext *ctx = opaque; |
85 | AVCUDADeviceContext *hwctx = ctx->device_ctx->hwctx; |
86 | CudaFunctions *cu = hwctx->internal->cuda_dl; |
87 | |
88 | AVBufferRef *ret = NULL; |
89 | CUcontext dummy = NULL; |
90 | CUdeviceptr data; |
91 | CUresult err; |
92 | |
93 | err = cu->cuCtxPushCurrent(hwctx->cuda_ctx); |
94 | if (err != CUDA_SUCCESS) { |
95 | av_log(ctx, AV_LOG_ERROR, "Error setting current CUDA context\n"); |
96 | return NULL; |
97 | } |
98 | |
99 | err = cu->cuMemAlloc(&data, size); |
100 | if (err != CUDA_SUCCESS) |
101 | goto fail; |
102 | |
103 | ret = av_buffer_create((uint8_t*)data, size, cuda_buffer_free, ctx, 0); |
104 | if (!ret) { |
105 | cu->cuMemFree(data); |
106 | goto fail; |
107 | } |
108 | |
109 | fail: |
110 | cu->cuCtxPopCurrent(&dummy); |
111 | return ret; |
112 | } |
113 | |
114 | static int cuda_frames_init(AVHWFramesContext *ctx) |
115 | { |
116 | CUDAFramesContext *priv = ctx->internal->priv; |
117 | int aligned_width = FFALIGN(ctx->width, CUDA_FRAME_ALIGNMENT); |
118 | int i; |
119 | |
120 | for (i = 0; i < FF_ARRAY_ELEMS(supported_formats); i++) { |
121 | if (ctx->sw_format == supported_formats[i]) |
122 | break; |
123 | } |
124 | if (i == FF_ARRAY_ELEMS(supported_formats)) { |
125 | av_log(ctx, AV_LOG_ERROR, "Pixel format '%s' is not supported\n", |
126 | av_get_pix_fmt_name(ctx->sw_format)); |
127 | return AVERROR(ENOSYS); |
128 | } |
129 | |
130 | av_pix_fmt_get_chroma_sub_sample(ctx->sw_format, &priv->shift_width, &priv->shift_height); |
131 | |
132 | if (!ctx->pool) { |
133 | int size; |
134 | |
135 | switch (ctx->sw_format) { |
136 | case AV_PIX_FMT_NV12: |
137 | case AV_PIX_FMT_YUV420P: |
138 | size = aligned_width * ctx->height * 3 / 2; |
139 | break; |
140 | case AV_PIX_FMT_YUV444P: |
141 | case AV_PIX_FMT_P010: |
142 | case AV_PIX_FMT_P016: |
143 | size = aligned_width * ctx->height * 3; |
144 | break; |
145 | default: |
146 | av_log(ctx, AV_LOG_ERROR, "BUG: Pixel format missing from size calculation."); |
147 | return AVERROR_BUG; |
148 | } |
149 | |
150 | ctx->internal->pool_internal = av_buffer_pool_init2(size, ctx, cuda_pool_alloc, NULL); |
151 | if (!ctx->internal->pool_internal) |
152 | return AVERROR(ENOMEM); |
153 | } |
154 | |
155 | return 0; |
156 | } |
157 | |
158 | static int cuda_get_buffer(AVHWFramesContext *ctx, AVFrame *frame) |
159 | { |
160 | int aligned_width; |
161 | int width_in_bytes = ctx->width; |
162 | |
163 | if (ctx->sw_format == AV_PIX_FMT_P010 || |
164 | ctx->sw_format == AV_PIX_FMT_P016) { |
165 | width_in_bytes *= 2; |
166 | } |
167 | aligned_width = FFALIGN(width_in_bytes, CUDA_FRAME_ALIGNMENT); |
168 | |
169 | frame->buf[0] = av_buffer_pool_get(ctx->pool); |
170 | if (!frame->buf[0]) |
171 | return AVERROR(ENOMEM); |
172 | |
173 | switch (ctx->sw_format) { |
174 | case AV_PIX_FMT_NV12: |
175 | case AV_PIX_FMT_P010: |
176 | case AV_PIX_FMT_P016: |
177 | frame->data[0] = frame->buf[0]->data; |
178 | frame->data[1] = frame->data[0] + aligned_width * ctx->height; |
179 | frame->linesize[0] = aligned_width; |
180 | frame->linesize[1] = aligned_width; |
181 | break; |
182 | case AV_PIX_FMT_YUV420P: |
183 | frame->data[0] = frame->buf[0]->data; |
184 | frame->data[2] = frame->data[0] + aligned_width * ctx->height; |
185 | frame->data[1] = frame->data[2] + aligned_width * ctx->height / 4; |
186 | frame->linesize[0] = aligned_width; |
187 | frame->linesize[1] = aligned_width / 2; |
188 | frame->linesize[2] = aligned_width / 2; |
189 | break; |
190 | case AV_PIX_FMT_YUV444P: |
191 | frame->data[0] = frame->buf[0]->data; |
192 | frame->data[1] = frame->data[0] + aligned_width * ctx->height; |
193 | frame->data[2] = frame->data[1] + aligned_width * ctx->height; |
194 | frame->linesize[0] = aligned_width; |
195 | frame->linesize[1] = aligned_width; |
196 | frame->linesize[2] = aligned_width; |
197 | break; |
198 | default: |
199 | av_frame_unref(frame); |
200 | return AVERROR_BUG; |
201 | } |
202 | |
203 | frame->format = AV_PIX_FMT_CUDA; |
204 | frame->width = ctx->width; |
205 | frame->height = ctx->height; |
206 | |
207 | return 0; |
208 | } |
209 | |
210 | static int cuda_transfer_get_formats(AVHWFramesContext *ctx, |
211 | enum AVHWFrameTransferDirection dir, |
212 | enum AVPixelFormat **formats) |
213 | { |
214 | enum AVPixelFormat *fmts; |
215 | |
216 | fmts = av_malloc_array(2, sizeof(*fmts)); |
217 | if (!fmts) |
218 | return AVERROR(ENOMEM); |
219 | |
220 | fmts[0] = ctx->sw_format; |
221 | fmts[1] = AV_PIX_FMT_NONE; |
222 | |
223 | *formats = fmts; |
224 | |
225 | return 0; |
226 | } |
227 | |
228 | static int cuda_transfer_data_from(AVHWFramesContext *ctx, AVFrame *dst, |
229 | const AVFrame *src) |
230 | { |
231 | CUDAFramesContext *priv = ctx->internal->priv; |
232 | AVCUDADeviceContext *device_hwctx = ctx->device_ctx->hwctx; |
233 | CudaFunctions *cu = device_hwctx->internal->cuda_dl; |
234 | |
235 | CUcontext dummy; |
236 | CUresult err; |
237 | int i; |
238 | |
239 | err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx); |
240 | if (err != CUDA_SUCCESS) |
241 | return AVERROR_UNKNOWN; |
242 | |
243 | for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { |
244 | CUDA_MEMCPY2D cpy = { |
245 | .srcMemoryType = CU_MEMORYTYPE_DEVICE, |
246 | .dstMemoryType = CU_MEMORYTYPE_HOST, |
247 | .srcDevice = (CUdeviceptr)src->data[i], |
248 | .dstHost = dst->data[i], |
249 | .srcPitch = src->linesize[i], |
250 | .dstPitch = dst->linesize[i], |
251 | .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), |
252 | .Height = src->height >> (i ? priv->shift_height : 0), |
253 | }; |
254 | |
255 | err = cu->cuMemcpy2D(&cpy); |
256 | if (err != CUDA_SUCCESS) { |
257 | av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); |
258 | return AVERROR_UNKNOWN; |
259 | } |
260 | } |
261 | |
262 | cu->cuCtxPopCurrent(&dummy); |
263 | |
264 | return 0; |
265 | } |
266 | |
267 | static int cuda_transfer_data_to(AVHWFramesContext *ctx, AVFrame *dst, |
268 | const AVFrame *src) |
269 | { |
270 | CUDAFramesContext *priv = ctx->internal->priv; |
271 | AVCUDADeviceContext *device_hwctx = ctx->device_ctx->hwctx; |
272 | CudaFunctions *cu = device_hwctx->internal->cuda_dl; |
273 | |
274 | CUcontext dummy; |
275 | CUresult err; |
276 | int i; |
277 | |
278 | err = cu->cuCtxPushCurrent(device_hwctx->cuda_ctx); |
279 | if (err != CUDA_SUCCESS) |
280 | return AVERROR_UNKNOWN; |
281 | |
282 | for (i = 0; i < FF_ARRAY_ELEMS(src->data) && src->data[i]; i++) { |
283 | CUDA_MEMCPY2D cpy = { |
284 | .srcMemoryType = CU_MEMORYTYPE_HOST, |
285 | .dstMemoryType = CU_MEMORYTYPE_DEVICE, |
286 | .srcHost = src->data[i], |
287 | .dstDevice = (CUdeviceptr)dst->data[i], |
288 | .srcPitch = src->linesize[i], |
289 | .dstPitch = dst->linesize[i], |
290 | .WidthInBytes = FFMIN(src->linesize[i], dst->linesize[i]), |
291 | .Height = src->height >> (i ? priv->shift_height : 0), |
292 | }; |
293 | |
294 | err = cu->cuMemcpy2D(&cpy); |
295 | if (err != CUDA_SUCCESS) { |
296 | av_log(ctx, AV_LOG_ERROR, "Error transferring the data from the CUDA frame\n"); |
297 | return AVERROR_UNKNOWN; |
298 | } |
299 | } |
300 | |
301 | cu->cuCtxPopCurrent(&dummy); |
302 | |
303 | return 0; |
304 | } |
305 | |
306 | static void cuda_device_uninit(AVHWDeviceContext *ctx) |
307 | { |
308 | AVCUDADeviceContext *hwctx = ctx->hwctx; |
309 | |
310 | if (hwctx->internal) { |
311 | if (hwctx->internal->is_allocated && hwctx->cuda_ctx) { |
312 | hwctx->internal->cuda_dl->cuCtxDestroy(hwctx->cuda_ctx); |
313 | hwctx->cuda_ctx = NULL; |
314 | } |
315 | cuda_free_functions(&hwctx->internal->cuda_dl); |
316 | } |
317 | |
318 | av_freep(&hwctx->internal); |
319 | } |
320 | |
321 | static int cuda_device_init(AVHWDeviceContext *ctx) |
322 | { |
323 | AVCUDADeviceContext *hwctx = ctx->hwctx; |
324 | int ret; |
325 | |
326 | if (!hwctx->internal) { |
327 | hwctx->internal = av_mallocz(sizeof(*hwctx->internal)); |
328 | if (!hwctx->internal) |
329 | return AVERROR(ENOMEM); |
330 | } |
331 | |
332 | if (!hwctx->internal->cuda_dl) { |
333 | ret = cuda_load_functions(&hwctx->internal->cuda_dl); |
334 | if (ret < 0) { |
335 | av_log(ctx, AV_LOG_ERROR, "Could not dynamically load CUDA\n"); |
336 | goto error; |
337 | } |
338 | } |
339 | |
340 | return 0; |
341 | |
342 | error: |
343 | cuda_device_uninit(ctx); |
344 | return ret; |
345 | } |
346 | |
347 | static int cuda_device_create(AVHWDeviceContext *ctx, const char *device, |
348 | AVDictionary *opts, int flags) |
349 | { |
350 | AVCUDADeviceContext *hwctx = ctx->hwctx; |
351 | CudaFunctions *cu; |
352 | CUdevice cu_device; |
353 | CUcontext dummy; |
354 | CUresult err; |
355 | int device_idx = 0; |
356 | |
357 | if (device) |
358 | device_idx = strtol(device, NULL, 0); |
359 | |
360 | if (cuda_device_init(ctx) < 0) |
361 | goto error; |
362 | |
363 | cu = hwctx->internal->cuda_dl; |
364 | |
365 | err = cu->cuInit(0); |
366 | if (err != CUDA_SUCCESS) { |
367 | av_log(ctx, AV_LOG_ERROR, "Could not initialize the CUDA driver API\n"); |
368 | goto error; |
369 | } |
370 | |
371 | err = cu->cuDeviceGet(&cu_device, device_idx); |
372 | if (err != CUDA_SUCCESS) { |
373 | av_log(ctx, AV_LOG_ERROR, "Could not get the device number %d\n", device_idx); |
374 | goto error; |
375 | } |
376 | |
377 | err = cu->cuCtxCreate(&hwctx->cuda_ctx, CU_CTX_SCHED_BLOCKING_SYNC, cu_device); |
378 | if (err != CUDA_SUCCESS) { |
379 | av_log(ctx, AV_LOG_ERROR, "Error creating a CUDA context\n"); |
380 | goto error; |
381 | } |
382 | |
383 | cu->cuCtxPopCurrent(&dummy); |
384 | |
385 | hwctx->internal->is_allocated = 1; |
386 | |
387 | return 0; |
388 | |
389 | error: |
390 | cuda_device_uninit(ctx); |
391 | return AVERROR_UNKNOWN; |
392 | } |
393 | |
394 | const HWContextType ff_hwcontext_type_cuda = { |
395 | .type = AV_HWDEVICE_TYPE_CUDA, |
396 | .name = "CUDA", |
397 | |
398 | .device_hwctx_size = sizeof(AVCUDADeviceContext), |
399 | .frames_priv_size = sizeof(CUDAFramesContext), |
400 | |
401 | .device_create = cuda_device_create, |
402 | .device_init = cuda_device_init, |
403 | .device_uninit = cuda_device_uninit, |
404 | .frames_get_constraints = cuda_frames_get_constraints, |
405 | .frames_init = cuda_frames_init, |
406 | .frames_get_buffer = cuda_get_buffer, |
407 | .transfer_get_formats = cuda_transfer_get_formats, |
408 | .transfer_data_to = cuda_transfer_data_to, |
409 | .transfer_data_from = cuda_transfer_data_from, |
410 | |
411 | .pix_fmts = (const enum AVPixelFormat[]){ AV_PIX_FMT_CUDA, AV_PIX_FMT_NONE }, |
412 | }; |
413 |