Tutorial: Encoding an MP4 File
This tutorial shows how to use the Transcode API to encode an MP4 file, using H.264 for the video stream and AAC for the audio stream.
- Headers and Library Files
- Define the Encoding Profiles
- Write the wmain Function
- Encode the File
- Media Session Helper
- Related topics
Headers and Library Files
Include the following header files.
#include <new>
#include <iostream>
#include <windows.h>
#include <mfapi.h>
#include <Mfidl.h>
#include <shlwapi.h>
Link the following library files.
#pragma comment(lib, "mfplat")
#pragma comment(lib, "mf")
#pragma comment(lib, "mfuuid")
#pragma comment(lib, "shlwapi")
Define the Encoding Profiles
One approach to encoding is to define a list of target encoding profiles that are known in advance. For this tutorial, we take a relatively simple approach, and store a list of encoding formats for H.264 video and AAC audio.
For H.264, the most important format attributes are the H.264 profile, the frame rate, the frame size, and the encoded bit rate. The following array contains a list of H.264 encoding formats.
struct H264ProfileInfo
{
UINT32 profile;
MFRatio fps;
MFRatio frame_size;
UINT32 bitrate;
};
H264ProfileInfo h264_profiles[] =
{
{ eAVEncH264VProfile_Base, { 15, 1 }, { 176, 144 }, 128000 },
{ eAVEncH264VProfile_Base, { 15, 1 }, { 352, 288 }, 384000 },
{ eAVEncH264VProfile_Base, { 30, 1 }, { 352, 288 }, 384000 },
{ eAVEncH264VProfile_Base, { 29970, 1000 }, { 320, 240 }, 528560 },
{ eAVEncH264VProfile_Base, { 15, 1 }, { 720, 576 }, 4000000 },
{ eAVEncH264VProfile_Main, { 25, 1 }, { 720, 576 }, 10000000 },
{ eAVEncH264VProfile_Main, { 30, 1 }, { 352, 288 }, 10000000 },
};
H.264 profiles are specified using the eAVEncH264VProfile enumeration. You could also specify the H.264 level, but the Microsoft Media Foundation H.264 Video Encoder can derive the proper level for a given video stream, so it is recommended not to override the encoder's selected level. For interlaced content, you would also specify the interlace mode (see Video Interlacing).
For AAC audio, the most important format attributes are the audio sample rate, the number of channels, the number of bits per sample, and the encoded bit rate. Optionally, you can set the AAC audio profile level indication. For more information, see AAC Encoder. The following array contains a list of AAC encoding formats.
struct AACProfileInfo
{
UINT32 samplesPerSec;
UINT32 numChannels;
UINT32 bitsPerSample;
UINT32 bytesPerSec;
UINT32 aacProfile;
};
AACProfileInfo aac_profiles[] =
{
{ 96000, 2, 16, 24000, 0x29},
{ 48000, 2, 16, 24000, 0x29},
{ 44100, 2, 16, 16000, 0x29},
{ 44100, 2, 16, 12000, 0x29},
};
Note
The H264ProfileInfo
and AACProfileInfo
structures defined here are not part of the Media Foundation API.
Write the wmain Function
The following code shows the entry point for the console application.
int video_profile = 0;
int audio_profile = 0;
int wmain(int argc, wchar_t* argv[])
{
HeapSetInformation(NULL, HeapEnableTerminationOnCorruption, NULL, 0);
if (argc < 3 || argc > 5)
{
std::cout << "Usage:" << std::endl;
std::cout << "input output [ audio_profile video_profile ]" << std::endl;
return 1;
}
if (argc > 3)
{
audio_profile = _wtoi(argv[3]);
}
if (argc > 4)
{
video_profile = _wtoi(argv[4]);
}
HRESULT hr = CoInitializeEx(NULL, COINIT_APARTMENTTHREADED);
if (SUCCEEDED(hr))
{
hr = MFStartup(MF_VERSION);
if (SUCCEEDED(hr))
{
hr = EncodeFile(argv[1], argv[2]);
MFShutdown();
}
CoUninitialize();
}
if (SUCCEEDED(hr))
{
std::cout << "Done." << std::endl;
}
else
{
std::cout << "Error: " << std::hex << hr << std::endl;
}
return 0;
}
The wmain
function does the following:
- Calls the CoInitializeEx function to initialize the COM library.
- Calls the MFStartup function to initialize Media Foundation.
- Calls the application-defined
EncodeFile
function. This function transcodes the input file to the output file, and is shown in the next section. - Calls the MFShutdown function to shut down Media Foundation.
- Call the CoUninitialize function to uninitialize the COM library.
Encode the File
The following code shows EncodeFile
function, which performs the transcoding. This function consists mostly of calls to other application-defined functions, which are shown later in this topic.
HRESULT EncodeFile(PCWSTR pszInput, PCWSTR pszOutput)
{
IMFTranscodeProfile *pProfile = NULL;
IMFMediaSource *pSource = NULL;
IMFTopology *pTopology = NULL;
CSession *pSession = NULL;
MFTIME duration = 0;
HRESULT hr = CreateMediaSource(pszInput, &pSource);
if (FAILED(hr))
{
goto done;
}
hr = GetSourceDuration(pSource, &duration);
if (FAILED(hr))
{
goto done;
}
hr = CreateTranscodeProfile(&pProfile);
if (FAILED(hr))
{
goto done;
}
hr = MFCreateTranscodeTopology(pSource, pszOutput, pProfile, &pTopology);
if (FAILED(hr))
{
goto done;
}
hr = CSession::Create(&pSession);
if (FAILED(hr))
{
goto done;
}
hr = pSession->StartEncodingSession(pTopology);
if (FAILED(hr))
{
goto done;
}
hr = RunEncodingSession(pSession, duration);
done:
if (pSource)
{
pSource->Shutdown();
}
SafeRelease(&pSession);
SafeRelease(&pProfile);
SafeRelease(&pSource);
SafeRelease(&pTopology);
return hr;
}
The EncodeFile
function performs the following steps.
- Creates a media source for the input file, using the URL or file path of the input file. (See Create the Media Source.)
- Gets the duration of the input file. (See Get the Source Duration.)
- Create the transcode profile. (See Create the Transcode Profile.)
- Call MFCreateTranscodeTopology to create the partial transcode topology.
- Create a helper object that manages the Media Session. (See Media Session Helper).
- Run the encoding session and wait for it to complete. (See Run the Encoding Session.)
- Call IMFMediaSource::Shutdown to shut down the media source.
- Release interface pointers. This code uses the SafeRelease function to release interface pointers. Another option is to use a COM smart pointer class, such as CComPtr.
Create the Media Source
The media source is the object that reads and parses the input file. To create the media source, pass the URL of the input file to the Source Resolver. The following code shows how to do this.
HRESULT CreateMediaSource(PCWSTR pszURL, IMFMediaSource **ppSource)
{
MF_OBJECT_TYPE ObjectType = MF_OBJECT_INVALID;
IMFSourceResolver* pResolver = NULL;
IUnknown* pSource = NULL;
// Create the source resolver.
HRESULT hr = MFCreateSourceResolver(&pResolver);
if (FAILED(hr))
{
goto done;
}
// Use the source resolver to create the media source
hr = pResolver->CreateObjectFromURL(pszURL, MF_RESOLUTION_MEDIASOURCE,
NULL, &ObjectType, &pSource);
if (FAILED(hr))
{
goto done;
}
// Get the IMFMediaSource interface from the media source.
hr = pSource->QueryInterface(IID_PPV_ARGS(ppSource));
done:
SafeRelease(&pResolver);
SafeRelease(&pSource);
return hr;
}
For more information, see Using the Source Resolver.
Get the Source Duration
Although not required, it is useful to query the media source for the duration of the input file. This value can be used to track the encoding progress. The duration is stored in the MF_PD_DURATION attribute of the presentation descriptor. Get the presentation descriptor by calling IMFMediaSource::CreatePresentationDescriptor.
HRESULT GetSourceDuration(IMFMediaSource *pSource, MFTIME *pDuration)
{
*pDuration = 0;
IMFPresentationDescriptor *pPD = NULL;
HRESULT hr = pSource->CreatePresentationDescriptor(&pPD);
if (SUCCEEDED(hr))
{
hr = pPD->GetUINT64(MF_PD_DURATION, (UINT64*)pDuration);
pPD->Release();
}
return hr;
}
Create the Transcode Profile
The transcode profile describes the encoding parameters. For more information about creating a transcode profile, see Using the Transcode API. To create the profile, perform the following steps.
- Call MFCreateTranscodeProfile to create the empty profile.
- Create a media type for the AAC audio stream. Add it to the profile by calling IMFTranscodeProfile::SetAudioAttributes.
- Create a media type for the H.264 video stream. Add it to the profile by calling IMFTranscodeProfile::SetVideoAttributes.
- Call MFCreateAttributes to create an attribute store for the container-level attributes.
- Set the MF_TRANSCODE_CONTAINERTYPE attribute. This is the only required container-level attribute. For MP4 file output, set this attribute to MFTranscodeContainerType_MPEG4.
- Call IMFTranscodeProfile::SetContainerAttributes to set the container-level attributes.
The following code shows these steps.
HRESULT CreateTranscodeProfile(IMFTranscodeProfile **ppProfile)
{
IMFTranscodeProfile *pProfile = NULL;
IMFAttributes *pAudio = NULL;
IMFAttributes *pVideo = NULL;
IMFAttributes *pContainer = NULL;
HRESULT hr = MFCreateTranscodeProfile(&pProfile);
if (FAILED(hr))
{
goto done;
}
// Audio attributes.
hr = CreateAACProfile(audio_profile, &pAudio);
if (FAILED(hr))
{
goto done;
}
hr = pProfile->SetAudioAttributes(pAudio);
if (FAILED(hr))
{
goto done;
}
// Video attributes.
hr = CreateH264Profile(video_profile, &pVideo);
if (FAILED(hr))
{
goto done;
}
hr = pProfile->SetVideoAttributes(pVideo);
if (FAILED(hr))
{
goto done;
}
// Container attributes.
hr = MFCreateAttributes(&pContainer, 1);
if (FAILED(hr))
{
goto done;
}
hr = pContainer->SetGUID(MF_TRANSCODE_CONTAINERTYPE, MFTranscodeContainerType_MPEG4);
if (FAILED(hr))
{
goto done;
}
hr = pProfile->SetContainerAttributes(pContainer);
if (FAILED(hr))
{
goto done;
}
*ppProfile = pProfile;
(*ppProfile)->AddRef();
done:
SafeRelease(&pProfile);
SafeRelease(&pAudio);
SafeRelease(&pVideo);
SafeRelease(&pContainer);
return hr;
}
To specify the attributes for the H.264 video stream, create an attribute store and set the following attributes:
Attribute | Desription |
---|---|
MF_MT_SUBTYPE | Set to MFVideoFormat_H264. |
MF_MT_MPEG2_PROFILE | H.264 profile. |
MF_MT_FRAME_SIZE | Frame size. |
MF_MT_FRAME_RATE | Frame rate. |
MF_MT_AVG_BITRATE | Encoded bit rate. |
To specify the attributes for the AAC audio stream, create an attribute store and set the following attributes:
Attribute | Desription |
---|---|
MF_MT_SUBTYPE | Set to MFAudioFormat_AAC |
MF_MT_AUDIO_SAMPLES_PER_SECOND | Audio sample rate. |
MF_MT_AUDIO_BITS_PER_SAMPLE | Bits per audio sample. |
MF_MT_AUDIO_NUM_CHANNELS | Number of audio channels. |
MF_MT_AUDIO_AVG_BYTES_PER_SECOND | Encoded bit rate. |
MF_MT_AUDIO_BLOCK_ALIGNMENT | Set to 1. |
MF_MT_AAC_AUDIO_PROFILE_LEVEL_INDICATION | AAC profile level indication (optional). |
The following code creates the video stream attributes.
HRESULT CreateH264Profile(DWORD index, IMFAttributes **ppAttributes)
{
if (index >= ARRAYSIZE(h264_profiles))
{
return E_INVALIDARG;
}
IMFAttributes *pAttributes = NULL;
const H264ProfileInfo& profile = h264_profiles[index];
HRESULT hr = MFCreateAttributes(&pAttributes, 5);
if (SUCCEEDED(hr))
{
hr = pAttributes->SetGUID(MF_MT_SUBTYPE, MFVideoFormat_H264);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(MF_MT_MPEG2_PROFILE, profile.profile);
}
if (SUCCEEDED(hr))
{
hr = MFSetAttributeSize(
pAttributes, MF_MT_FRAME_SIZE,
profile.frame_size.Numerator, profile.frame_size.Numerator);
}
if (SUCCEEDED(hr))
{
hr = MFSetAttributeRatio(
pAttributes, MF_MT_FRAME_RATE,
profile.fps.Numerator, profile.fps.Denominator);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(MF_MT_AVG_BITRATE, profile.bitrate);
}
if (SUCCEEDED(hr))
{
*ppAttributes = pAttributes;
(*ppAttributes)->AddRef();
}
SafeRelease(&pAttributes);
return hr;
}
The following code creates the audio stream attributes.
HRESULT CreateAACProfile(DWORD index, IMFAttributes **ppAttributes)
{
if (index >= ARRAYSIZE(aac_profiles))
{
return E_INVALIDARG;
}
const AACProfileInfo& profile = aac_profiles[index];
IMFAttributes *pAttributes = NULL;
HRESULT hr = MFCreateAttributes(&pAttributes, 7);
if (SUCCEEDED(hr))
{
hr = pAttributes->SetGUID(MF_MT_SUBTYPE, MFAudioFormat_AAC);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(
MF_MT_AUDIO_BITS_PER_SAMPLE, profile.bitsPerSample);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(
MF_MT_AUDIO_SAMPLES_PER_SECOND, profile.samplesPerSec);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(
MF_MT_AUDIO_NUM_CHANNELS, profile.numChannels);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(
MF_MT_AUDIO_AVG_BYTES_PER_SECOND, profile.bytesPerSec);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(MF_MT_AUDIO_BLOCK_ALIGNMENT, 1);
}
if (SUCCEEDED(hr))
{
hr = pAttributes->SetUINT32(
MF_MT_AAC_AUDIO_PROFILE_LEVEL_INDICATION, profile.aacProfile);
}
if (SUCCEEDED(hr))
{
*ppAttributes = pAttributes;
(*ppAttributes)->AddRef();
}
SafeRelease(&pAttributes);
return hr;
}
Note that the transcode API does not require a true media type, although it uses media-type attributes. In particular, the MF_MT_MAJOR_TYPE attribute it not required, because the SetVideoAttributes and SetAudioAttributes methods imply the major type. However, it also valid to pass an actual media type to these methods. (The IMFMediaType interface inherits IMFAttributes.)
Run the Encoding Session
The following code runs the encoding session. It uses the Media Session helper class, which is shown in the next section.
HRESULT RunEncodingSession(CSession *pSession, MFTIME duration)
{
const DWORD WAIT_PERIOD = 500;
const int UPDATE_INCR = 5;
HRESULT hr = S_OK;
MFTIME pos;
LONGLONG prev = 0;
while (1)
{
hr = pSession->Wait(WAIT_PERIOD);
if (hr == E_PENDING)
{
hr = pSession->GetEncodingPosition(&pos);
LONGLONG percent = (100 * pos) / duration ;
if (percent >= prev + UPDATE_INCR)
{
std::cout << percent << "% .. ";
prev = percent;
}
}
else
{
std::cout << std::endl;
break;
}
}
return hr;
}
Media Session Helper
The Media Session is described more fully in the Media Foundation Architecture section of this documentation. The Media Session uses an asynchronous event model. In a GUI application, you should respond to session events without blocking the UI thread to wait for the next event. The tutorial How to Play Unprotected Media Files shows how to do this in a playback application. For encoding, the principle is the same, but fewer events are relevant:
Event | Desription |
---|---|
MESessionEnded | Raised when the encoding is complete. |
MESessionClosed | Raised when the IMFMediaSession::Close method completes. After this event is raised, it is safe to shut down the Media Session. |
For a console application, it is reasonable to block and wait for events. Depending on the source file and the encoding settings, it might take awhile to complete the encoding. You can get progress updates as follows:
- Call IMFMediaSession::GetClock to get the presentation clock.
- Query the clock for the IMFPresentationClock interface.
- Call IMFPresentationClock::GetTime to get the current position.
- The position is given in units of time. To get the percentage completed, use the value
(100 * position) / duration
.
Here is the declaration of the CSession
class.
class CSession : public IMFAsyncCallback
{
public:
static HRESULT Create(CSession **ppSession);
// IUnknown methods
STDMETHODIMP QueryInterface(REFIID riid, void** ppv);
STDMETHODIMP_(ULONG) AddRef();
STDMETHODIMP_(ULONG) Release();
// IMFAsyncCallback methods
STDMETHODIMP GetParameters(DWORD* pdwFlags, DWORD* pdwQueue)
{
// Implementation of this method is optional.
return E_NOTIMPL;
}
STDMETHODIMP Invoke(IMFAsyncResult *pResult);
// Other methods
HRESULT StartEncodingSession(IMFTopology *pTopology);
HRESULT GetEncodingPosition(MFTIME *pTime);
HRESULT Wait(DWORD dwMsec);
private:
CSession() : m_cRef(1), m_pSession(NULL), m_pClock(NULL), m_hrStatus(S_OK), m_hWaitEvent(NULL)
{
}
virtual ~CSession()
{
if (m_pSession)
{
m_pSession->Shutdown();
}
SafeRelease(&m_pClock);
SafeRelease(&m_pSession);
CloseHandle(m_hWaitEvent);
}
HRESULT Initialize();
private:
IMFMediaSession *m_pSession;
IMFPresentationClock *m_pClock;
HRESULT m_hrStatus;
HANDLE m_hWaitEvent;
long m_cRef;
};
The following code shows the complete implementation of the CSession
class.
HRESULT CSession::Create(CSession **ppSession)
{
*ppSession = NULL;
CSession *pSession = new (std::nothrow) CSession();
if (pSession == NULL)
{
return E_OUTOFMEMORY;
}
HRESULT hr = pSession->Initialize();
if (FAILED(hr))
{
pSession->Release();
return hr;
}
*ppSession = pSession;
return S_OK;
}
STDMETHODIMP CSession::QueryInterface(REFIID riid, void** ppv)
{
static const QITAB qit[] =
{
QITABENT(CSession, IMFAsyncCallback),
{ 0 }
};
return QISearch(this, qit, riid, ppv);
}
STDMETHODIMP_(ULONG) CSession::AddRef()
{
return InterlockedIncrement(&m_cRef);
}
STDMETHODIMP_(ULONG) CSession::Release()
{
long cRef = InterlockedDecrement(&m_cRef);
if (cRef == 0)
{
delete this;
}
return cRef;
}
HRESULT CSession::Initialize()
{
IMFClock *pClock = NULL;
HRESULT hr = MFCreateMediaSession(NULL, &m_pSession);
if (FAILED(hr))
{
goto done;
}
hr = m_pSession->GetClock(&pClock);
if (FAILED(hr))
{
goto done;
}
hr = pClock->QueryInterface(IID_PPV_ARGS(&m_pClock));
if (FAILED(hr))
{
goto done;
}
hr = m_pSession->BeginGetEvent(this, NULL);
if (FAILED(hr))
{
goto done;
}
m_hWaitEvent = CreateEvent(NULL, FALSE, FALSE, NULL);
if (m_hWaitEvent == NULL)
{
hr = HRESULT_FROM_WIN32(GetLastError());
}
done:
SafeRelease(&pClock);
return hr;
}
// Implements IMFAsyncCallback::Invoke
STDMETHODIMP CSession::Invoke(IMFAsyncResult *pResult)
{
IMFMediaEvent* pEvent = NULL;
MediaEventType meType = MEUnknown;
HRESULT hrStatus = S_OK;
HRESULT hr = m_pSession->EndGetEvent(pResult, &pEvent);
if (FAILED(hr))
{
goto done;
}
hr = pEvent->GetType(&meType);
if (FAILED(hr))
{
goto done;
}
hr = pEvent->GetStatus(&hrStatus);
if (FAILED(hr))
{
goto done;
}
if (FAILED(hrStatus))
{
hr = hrStatus;
goto done;
}
switch (meType)
{
case MESessionEnded:
hr = m_pSession->Close();
if (FAILED(hr))
{
goto done;
}
break;
case MESessionClosed:
SetEvent(m_hWaitEvent);
break;
}
if (meType != MESessionClosed)
{
hr = m_pSession->BeginGetEvent(this, NULL);
}
done:
if (FAILED(hr))
{
m_hrStatus = hr;
m_pSession->Close();
}
SafeRelease(&pEvent);
return hr;
}
HRESULT CSession::StartEncodingSession(IMFTopology *pTopology)
{
HRESULT hr = m_pSession->SetTopology(0, pTopology);
if (SUCCEEDED(hr))
{
PROPVARIANT varStart;
PropVariantClear(&varStart);
hr = m_pSession->Start(&GUID_NULL, &varStart);
}
return hr;
}
HRESULT CSession::GetEncodingPosition(MFTIME *pTime)
{
return m_pClock->GetTime(pTime);
}
HRESULT CSession::Wait(DWORD dwMsec)
{
HRESULT hr = S_OK;
DWORD dwTimeoutStatus = WaitForSingleObject(m_hWaitEvent, dwMsec);
if (dwTimeoutStatus != WAIT_OBJECT_0)
{
hr = E_PENDING;
}
else
{
hr = m_hrStatus;
}
return hr;
}
Related topics