Algorithm Implementation/Miscellaneous/Base64
The traditional (MIME) base64 encoding and decoding processes are fairly simple to implement. Here an example using JavaScript is given, including the MIME/etc required line breaks at particular line lengths. It is worth noting however, that many base64 functions (e.g. in PHP) return base64 encoded strings without the line breaks, as the line breaks can be inserted easily after encoding, and many times the base64 encoding is desired only for safely transferring data via XML or inserting into a database, etc. — times when the line breaks are known to be unnecessary and therefore undesirable. The newline inserting and removing in these functions here can easily be commented out (they are each only one line in the respective functions) if they are not needed.
An array of the base 64 characters is necessary for encoding, such as:
var base64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'.split("");
And decoding will require the inverse list (swap the indices for the values), such as:
var base64inv = {};
for (var i = 0; i < base64chars.length; i++)
{
base64inv[base64chars[i]] = i;
}
Note that in real implementations, it is better to explicitly list the entire array/hash for each list above — the one-liners here are given to demonstrate the concept as directly as possible, rather than being the ideal in practice.
Encode
[edit | edit source]Javascript
[edit | edit source]The base64 encoding function:
function base64_encode (s)
{
// the result/encoded string, the padding string, and the pad count
var base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
var r = "";
var p = "";
var c = s.length % 3;
// add a right zero pad to make this string a multiple of 3 characters
if (c > 0) {
for (; c < 3; c++) {
p += '=';
s += "\0";
}
}
// increment over the length of the string, three characters at a time
for (c = 0; c < s.length; c += 3) {
// we add newlines after every 76 output characters, according to the MIME specs
if (c > 0 && (c / 3 * 4) % 76 == 0) {
r += "\r\n";
}
// these three 8-bit (ASCII) characters become one 24-bit number
var n = (s.charCodeAt(c) << 16) + (s.charCodeAt(c+1) << 8) + s.charCodeAt(c+2);
// this 24-bit number gets separated into four 6-bit numbers
n = [(n >>> 18) & 63, (n >>> 12) & 63, (n >>> 6) & 63, n & 63];
// those four 6-bit numbers are used as indices into the base64 character list
r += base64chars[n[0]] + base64chars[n[1]] + base64chars[n[2]] + base64chars[n[3]];
}
// add the actual padding string, after removing the zero pad
return r.substring(0, r.length - p.length) + p;
}
Java
[edit | edit source]The base64 encoding function:
class Base64Encode {
private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
public static String encode(String s) {
// the result/encoded string, the padding string, and the pad count
String r = "", p = "";
int c = s.length() % 3;
// add a right zero pad to make this string a multiple of 3 characters
if (c > 0) {
for (; c < 3; c++) {
p += "=";
s += "\0";
}
}
// increment over the length of the string, three characters at a time
for (c = 0; c < s.length(); c += 3) {
// we add newlines after every 76 output characters, according to
// the MIME specs
if (c > 0 && (c / 3 * 4) % 76 == 0)
r += "\r\n";
// these three 8-bit (ASCII) characters become one 24-bit number
int n = (s.charAt(c) << 16) + (s.charAt(c + 1) << 8)
+ (s.charAt(c + 2));
// this 24-bit number gets separated into four 6-bit numbers
int n1 = (n >> 18) & 63, n2 = (n >> 12) & 63, n3 = (n >> 6) & 63, n4 = n & 63;
// those four 6-bit numbers are used as indices into the base64
// character list
r += "" + base64chars.charAt(n1) + base64chars.charAt(n2)
+ base64chars.charAt(n3) + base64chars.charAt(n4);
}
return r.substring(0, r.length() - p.length()) + p;
}
}
An alternate implementation could use Java Streams. For example, one could create a Base64EncoderStream, inheriting from FilterOutputStream.
Ruby
[edit | edit source]Base64 module documentation show the shortest way to encode a string:
irb(main):001:0> require "base64"
=> true
irb(main):002:0> enc = Base64.encode64('Send reinforcements')
=> "U2VuZCByZWluZm9yY2VtZW50cw==\n"
irb(main):003:0> plain = Base64.decode64(enc)
=> "Send reinforcements"
Ruby version, implemented as a tool that encodes the file specified on the command line:
BASE64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
File.open(ARGV[0], 'rb') do |file| # read file name passed on command line
c = 0 # count outputted characters
while (buf = file.read(3)) != nil do # read up to 3 bytes at a time
pad = '' # padding for this group of 4 characters
while buf.length < 3
buf << "\0"
pad << '='
end
group24 = (buf[0] << 16) | (buf[1] << 8) | buf[2] # current 3 bytes as a 24 bit value
encoded = BASE64_CHARS[(group24 >> 18) & 0x3f, 1] # read the 24 bit value 6 bits at a time
encoded << BASE64_CHARS[(group24 >> 12) & 0x3f, 1]
encoded << BASE64_CHARS[(group24 >> 6) & 0x3f, 1]
encoded << BASE64_CHARS[(group24 >> 0) & 0x3f, 1]
encoded[4 - pad.length, pad.length] = pad # add the padding
print encoded
print "\r\n" if (c += 4) % 76 == 0 # output newline ever 76th character
end
end
C
[edit | edit source]C version of the base64 encoding function:
#include <inttypes.h>
#include <string.h>
int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize)
{
const char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
const uint8_t *data = (const uint8_t *)data_buf;
size_t resultIndex = 0;
size_t x;
uint32_t n = 0;
int padCount = dataLength % 3;
uint8_t n0, n1, n2, n3;
/* increment over the length of the string, three characters at a time */
for (x = 0; x < dataLength; x += 3)
{
/* these three 8-bit (ASCII) characters become one 24-bit number */
n = ((uint32_t)data[x]) << 16; //parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
if((x+1) < dataLength)
n += ((uint32_t)data[x+1]) << 8;//parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
if((x+2) < dataLength)
n += data[x+2];
/* this 24-bit number gets separated into four 6-bit numbers */
n0 = (uint8_t)(n >> 18) & 63;
n1 = (uint8_t)(n >> 12) & 63;
n2 = (uint8_t)(n >> 6) & 63;
n3 = (uint8_t)n & 63;
/*
* if we have one byte available, then its encoding is spread
* out over two characters
*/
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n0];
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n1];
/*
* if we have only two bytes available, then their encoding is
* spread out over three chars
*/
if((x+1) < dataLength)
{
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n2];
}
/*
* if we have all three bytes available, then their encoding is spread
* out over four characters
*/
if((x+2) < dataLength)
{
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = base64chars[n3];
}
}
/*
* create and add padding that is required if we did not have a multiple of 3
* number of characters available
*/
if (padCount > 0)
{
for (; padCount < 3; padCount++)
{
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex++] = '=';
}
}
if(resultIndex >= resultSize) return 1; /* indicate failure: buffer too small */
result[resultIndex] = 0;
return 0; /* indicate success */
}
C++
[edit | edit source]Below is a C++ version of Base64Encode. This code is released into public domain.
#include <string>
#include <vector>
// Prototype
// std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer);
// This line goes in header file
/* Define these if they aren't already in your environment
* #define TEXT(x) Lx //Unicode
* #define TCHAR wchar_t //Unicode
* #define TCHAR char //Not unicode
* #define TEXT(x) x //Not unicode
* #define DWORD long
* #define BYTE unsigned char
* They are defined by default in Windows.h
*/
//Lookup table for encoding
//If you want to use an alternate alphabet, change the characters here
const static TCHAR encodeLookup[] = TEXT("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
const static TCHAR padCharacter = TEXT('=');
std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer)
{
std::basic_string<TCHAR> encodedString;
encodedString.reserve(((inputBuffer.size()/3) + (inputBuffer.size() % 3 > 0)) * 4);
DWORD temp;
std::vector<BYTE>::iterator cursor = inputBuffer.begin();
for(size_t idx = 0; idx < inputBuffer.size()/3; idx++)
{
temp = (*cursor++) << 16; //Convert to big endian
temp += (*cursor++) << 8;
temp += (*cursor++);
encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
encodedString.append(1,encodeLookup[(temp & 0x0000003F) ]);
}
switch(inputBuffer.size() % 3)
{
case 1:
temp = (*cursor++) << 16; //Convert to big endian
encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
encodedString.append(2,padCharacter);
break;
case 2:
temp = (*cursor++) << 16; //Convert to big endian
temp += (*cursor++) << 8;
encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
encodedString.append(1,padCharacter);
break;
}
return encodedString;
}
R
[edit | edit source]base64encode <- function(sobj) {
sstr <- as.character(sobj)
stopifnot(length(sstr) == 1) # we only like 1-entry string vectors for now
if (nchar(sstr) == 0) return("")
b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
shfts <- c(18,12,6,0)
sand <- function(n,s) bitwAnd(bitwShiftR(n,s),63)+1
slft <- function(p,n) bitwShiftL(as.integer(p),n)
subs <- function(s,n) substring(s,n,n)
sbit <- charToRaw(sstr)
npad <- ( 3 - length(sbit) %% 3) %% 3 # yeah.
sbit <- c(sbit,rep(as.raw(0),npad))
pces <- lapply(seq(1,length(sbit),by=3),function(ii) sbit[ii:(ii+2)])
encv <- paste(sapply(pces,function(p) paste0(sapply(shfts,function(s)(subs(b64c,sand(slft(p[1],16)+slft(p[2],8)+slft(p[3],0),s)))))),collapse="")
if (npad > 0) substr(encv,nchar(encv)-npad+1,nchar(encv)) <- paste(rep("=",npad),collapse="")
return(encv)
}
Decode
[edit | edit source]Javascript
[edit | edit source]The base64 decoding function:
function base64_decode (s)
{
var base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
// remove/ignore any characters not in the base64 characters list
// or the pad character -- particularly newlines
s = s.replace(new RegExp('[^'+base64chars.split("")+'=]', 'g'), "");
// replace any incoming padding with a zero pad (the 'A' character is zero)
var p = (s.charAt(s.length-1) == '=' ?
(s.charAt(s.length-2) == '=' ? 'AA' : 'A') : "");
var r = "";
s = s.substr(0, s.length - p.length) + p;
// increment over the length of this encoded string, four characters at a time
for (var c = 0; c < s.length; c += 4) {
// each of these four characters represents a 6-bit index in the base64 characters list
// which, when concatenated, will give the 24-bit number for the original 3 characters
var n = (base64chars.indexOf(s.charAt(c)) << 18) + (base64chars.indexOf(s.charAt(c+1)) << 12) +
(base64chars.indexOf(s.charAt(c+2)) << 6) + base64chars.indexOf(s.charAt(c+3));
// split the 24-bit number into the original three 8-bit (ASCII) characters
r += String.fromCharCode((n >>> 16) & 255, (n >>> 8) & 255, n & 255);
}
// remove any zero pad that was added to make this a multiple of 24 bits
return r.substring(0, r.length - p.length);
}
The above implementation is best with a language like JavaScript that handles string concatenation of arbitrary length strings very efficiently. Other languages (e.g. C) will work much more efficiently by allocating memory for a new string/array of the appropriate size (the output string length is easily calculated from the input string at the very beginning) and then simply setting each character index, as opposed to concatenation.
Java
[edit | edit source]The base64 decoding function:
class Base64Decode {
private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
public static String decode(String s) {
// remove/ignore any characters not in the base64 characters list
// or the pad character -- particularly newlines
s = s.replaceAll("[^" + base64chars + "=]", "");
// replace any incoming padding with a zero pad (the 'A' character is
// zero)
String p = (s.charAt(s.length() - 1) == '=' ?
(s.charAt(s.length() - 2) == '=' ? "AA" : "A") : "");
String r = "";
s = s.substring(0, s.length() - p.length()) + p;
// increment over the length of this encoded string, four characters
// at a time
for (int c = 0; c < s.length(); c += 4) {
// each of these four characters represents a 6-bit index in the
// base64 characters list which, when concatenated, will give the
// 24-bit number for the original 3 characters
int n = (base64chars.indexOf(s.charAt(c)) << 18)
+ (base64chars.indexOf(s.charAt(c + 1)) << 12)
+ (base64chars.indexOf(s.charAt(c + 2)) << 6)
+ base64chars.indexOf(s.charAt(c + 3));
// split the 24-bit number into the original three 8-bit (ASCII)
// characters
r += "" + (char) ((n >>> 16) & 0xFF) + (char) ((n >>> 8) & 0xFF)
+ (char) (n & 0xFF);
}
// remove any zero pad that was added to make this a multiple of 24 bits
return r.substring(0, r.length() - p.length());
}
}
An alternate implementation could use Java Streams. For example, one could create a Base64DecoderStream, inheriting from FilterInputStream.
C
[edit | edit source]Counterpart C decoder for the encoder above. This code is also public domain. This solution has been optimized using pointer math and a look-up table. This algorithm handles multiple encoding formats: with and without line breaks, with and without whitespace, and with and without padding characters.
#define WHITESPACE 64
#define EQUALS 65
#define INVALID 66
static const unsigned char d[] = {
66,66,66,66,66,66,66,66,66,66,64,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,62,66,66,66,63,52,53,
54,55,56,57,58,59,60,61,66,66,66,65,66,66,66, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,66,66,66,66,66,66,26,27,28,
29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
66,66,66,66,66,66
};
int base64decode (char *in, size_t inLen, unsigned char *out, size_t *outLen) {
char *end = in + inLen;
char iter = 0;
uint32_t buf = 0;
size_t len = 0;
while (in < end) {
unsigned char c = d[*in++];
switch (c) {
case WHITESPACE: continue; /* skip whitespace */
case INVALID: return 1; /* invalid input, return error */
case EQUALS: /* pad character, end of data */
in = end;
continue;
default:
buf = buf << 6 | c;
iter++; // increment the number of iteration
/* If the buffer is full, split it into bytes */
if (iter == 4) {
if ((len += 3) > *outLen) return 1; /* buffer overflow */
*(out++) = (buf >> 16) & 255;
*(out++) = (buf >> 8) & 255;
*(out++) = buf & 255;
buf = 0; iter = 0;
}
}
}
if (iter == 3) {
if ((len += 2) > *outLen) return 1; /* buffer overflow */
*(out++) = (buf >> 10) & 255;
*(out++) = (buf >> 2) & 255;
}
else if (iter == 2) {
if (++len > *outLen) return 1; /* buffer overflow */
*(out++) = (buf >> 4) & 255;
}
*outLen = len; /* modify to reflect the actual output size */
return 0;
}
C++
[edit | edit source]Counterpart C++ decoder for the encoder above. This code is also public domain. While a more general solution is possible by using a table similar to that in encoding, and then finding the index of the character we want in that table, and while that solution is much simpler, this version with the if-else-if ladder is faster because it does not need to run as many character comparisons to find the right character to use. The alphabet specific parts of this code are pretty much the same across most base64 alphabets however.
Items that need changed to accommodate the base64url encoding are commented as such.
#include <string>
#include <vector>
// Prototype
// std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input);
// This line goes in header file
/* Define these if they aren't already in your environment
* #define TEXT(x) Lx //Unicode
* #define TCHAR wchar_t //Unicode
* #define TCHAR char //Not unicode
* #define TEXT(x) x //Not unicode
* #define DWORD long
* #define BYTE unsigned char
* They are defined by default in Windows.h
*/
const static TCHAR padCharacter = TEXT('=');
std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input)
{
if (input.length() % 4) //Sanity check
throw std::runtime_error("Non-Valid base64!");
size_t padding = 0;
if (input.length())
{
if (input[input.length()-1] == padCharacter)
padding++;
if (input[input.length()-2] == padCharacter)
padding++;
}
//Setup a vector to hold the result
std::vector<BYTE> decodedBytes;
decodedBytes.reserve(((input.length()/4)*3) - padding);
DWORD temp=0; //Holds decoded quanta
std::basic_string<TCHAR>::const_iterator cursor = input.begin();
while (cursor < input.end())
{
for (size_t quantumPosition = 0; quantumPosition < 4; quantumPosition++)
{
temp <<= 6;
if (*cursor >= 0x41 && *cursor <= 0x5A) // This area will need tweaking if
temp |= *cursor - 0x41; // you are using an alternate alphabet
else if (*cursor >= 0x61 && *cursor <= 0x7A)
temp |= *cursor - 0x47;
else if (*cursor >= 0x30 && *cursor <= 0x39)
temp |= *cursor + 0x04;
else if (*cursor == 0x2B)
temp |= 0x3E; //change to 0x2D for URL alphabet
else if (*cursor == 0x2F)
temp |= 0x3F; //change to 0x5F for URL alphabet
else if (*cursor == padCharacter) //pad
{
switch( input.end() - cursor )
{
case 1: //One pad character
decodedBytes.push_back((temp >> 16) & 0x000000FF);
decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
return decodedBytes;
case 2: //Two pad characters
decodedBytes.push_back((temp >> 10) & 0x000000FF);
return decodedBytes;
default:
throw std::runtime_error("Invalid Padding in Base 64!");
}
} else
throw std::runtime_error("Non-Valid Character in Base 64!");
cursor++;
}
decodedBytes.push_back((temp >> 16) & 0x000000FF);
decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
decodedBytes.push_back((temp ) & 0x000000FF);
}
return decodedBytes;
}
R
[edit | edit source]base64decode <- function(eobj) {
estr <- as.character(eobj)
stopifnot(length(estr) == 1)
b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
estr <- gsub(paste("[^",b64c,"=]"),"",estr)
if (nchar(estr) == 0) return("")
estr <- gsub("=","A",estr,fixed=T)
ebit <- charToRaw(estr); b64r <- charToRaw(b64c)
pces <- lapply(seq(1,length(ebit),by=4),function(ii) ebit[ii:(ii+3)])
slft <- function(p,n) bitwShiftL(match(p,b64r)-1,n) # note the -1
sand <- function(n,s) intToUtf8(bitwAnd(bitwShiftR(n,s),0xFF))
return(paste(sapply(pces,function(p){
n <- slft(p[1],18) + slft(p[2],12) + slft(p[3],6) + slft(p[4],0)
c(sand(n,16),sand(n,8),sand(n,0))
}),collapse=""))
}