Algorithm Implementation/Miscellaneous/Base64

From Wikibooks, open books for an open world
< Algorithm Implementation‎ | Miscellaneous
Jump to: navigation, search

The traditional (MIME) base64 encoding and decoding processes are fairly simple to implement. Here an example using JavaScript is given, including the MIME/etc required line breaks at particular line lengths. It is worth noting however, that many base64 functions (e.g. in PHP) return base64 encoded strings without the line breaks, as the line breaks can be inserted easily after encoding, and many times the base64 encoding is desired only for safely transferring data via XML or inserting into a database, etc. — times when the line breaks are known to be unnecessary and therefore undesirable. The newline inserting and removing in these functions here can easily be commented out (they are each only one line in the respective functions) if they are not needed.

An array of the base 64 characters is necessary for encoding, such as:

 var base64chars = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'.split("");

And decoding will require the inverse list (swap the indices for the values), such as:

var base64inv = {}; 
for (var i = 0; i < base64chars.length; i++) 
{ 
  base64inv[base64chars[i]] = i; 
}

Note that in real implementations, it is better to explicitly list the entire array/hash for each list above — the one-liners here are given to demonstrate the concept as directly as possible, rather than being the ideal in practice.

Encode[edit]

Javascript[edit]

The base64 encoding function:

function base64_encode (s)
{
  // the result/encoded string, the padding string, and the pad count
  var r = ""; 
  var p = ""; 
  var c = s.length % 3;
 
  // add a right zero pad to make this string a multiple of 3 characters
  if (c > 0) { 
    for (; c < 3; c++) { 
      p += '='; 
      s += "\0"; 
    } 
  }
 
  // increment over the length of the string, three characters at a time
  for (c = 0; c < s.length; c += 3) {
 
    // we add newlines after every 76 output characters, according to the MIME specs
    if (c > 0 && (c / 3 * 4) % 76 == 0) { 
      r += "\r\n"; 
    }
 
    // these three 8-bit (ASCII) characters become one 24-bit number
    var n = (s.charCodeAt(c) << 16) + (s.charCodeAt(c+1) << 8) + s.charCodeAt(c+2);
 
    // this 24-bit number gets separated into four 6-bit numbers
    n = [(n >>> 18) & 63, (n >>> 12) & 63, (n >>> 6) & 63, n & 63];
 
    // those four 6-bit numbers are used as indices into the base64 character list
    r += base64chars[n[0]] + base64chars[n[1]] + base64chars[n[2]] + base64chars[n[3]];
  }
   // add the actual padding string, after removing the zero pad
  return r.substring(0, r.length - p.length) + p;
}

Java[edit]

The base64 encoding function:

class Base64Encode {
    private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
    public static String encode(String s) {
 
	// the result/encoded string, the padding string, and the pad count
	String r = "", p = "";
	int c = s.length() % 3;
 
	// add a right zero pad to make this string a multiple of 3 characters
	if (c > 0) {
	    for (; c < 3; c++) {
		p += "=";
		s += "\0";
	    }
	}
 
	// increment over the length of the string, three characters at a time
	for (c = 0; c < s.length(); c += 3) {
 
	    // we add newlines after every 76 output characters, according to
	    // the MIME specs
	    if (c > 0 && (c / 3 * 4) % 76 == 0)
		r += "\r\n";
 
	    // these three 8-bit (ASCII) characters become one 24-bit number
	    int n = (s.charAt(c) << 16) + (s.charAt(c + 1) << 8)
		    + (s.charAt(c + 2));
 
	    // this 24-bit number gets separated into four 6-bit numbers
	    int n1 = (n >> 18) & 63, n2 = (n >> 12) & 63, n3 = (n >> 6) & 63, n4 = n & 63;
 
	    // those four 6-bit numbers are used as indices into the base64
	    // character list
	    r += "" + base64chars.charAt(n1) + base64chars.charAt(n2)
		    + base64chars.charAt(n3) + base64chars.charAt(n4);
	}
 
	return r.substring(0, r.length() - p.length()) + p;
    }
}

An alternate implementation could use Java Streams. For example one could create a Base64EncoderStream, inheriting from FilterOutputStream.

Ruby[edit]

Ruby version, implemented as a tool that encodes the file specified on the command line:

BASE64_CHARS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'
File.open(ARGV[0], 'rb') do |file| # read file name passed on command line
	c = 0 # count outputted characters
	while (buf = file.read(3)) != nil do # read up to 3 bytes at a time
		pad = '' # padding for this group of 4 characters
		while buf.length < 3
			buf << "\0"
			pad << '='
		end
		group24 = (buf[0] << 16) | (buf[1] << 8) | buf[2] # current 3 bytes as a 24 bit value
		encoded = BASE64_CHARS[(group24 >> 18) & 0x3f, 1] # read the 24 bit value 6 bits at a time
		encoded << BASE64_CHARS[(group24 >> 12) & 0x3f, 1]
		encoded << BASE64_CHARS[(group24 >> 6) & 0x3f, 1]
		encoded << BASE64_CHARS[(group24 >> 0) & 0x3f, 1]
		encoded[4 - pad.length, pad.length] = pad # add the padding
		print encoded
		print "\r\n" if (c += 4) % 76 == 0 # output newline ever 76th character
	end
end

C[edit]

C version of the base64 encoding function:

#include <inttypes.h>
#include <string.h>
 
int base64encode(const void* data_buf, size_t dataLength, char* result, size_t resultSize)
{
   const char base64chars[] = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
   const uint8_t *data = (const uint8_t *)data_buf;
   size_t resultIndex = 0;
   size_t x;
   uint32_t n = 0;
   int padCount = dataLength % 3;
   uint8_t n0, n1, n2, n3;
 
   /* increment over the length of the string, three characters at a time */
   for (x = 0; x < dataLength; x += 3) 
   {
      /* these three 8-bit (ASCII) characters become one 24-bit number */
      n = ((uint32_t)data[x]) << 16; //parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
 
      if((x+1) < dataLength)
         n += ((uint32_t)data[x+1]) << 8;//parenthesis needed, compiler depending on flags can do the shifting before conversion to uint32_t, resulting to 0
 
      if((x+2) < dataLength)
         n += data[x+2];
 
      /* this 24-bit number gets separated into four 6-bit numbers */
      n0 = (uint8_t)(n >> 18) & 63;
      n1 = (uint8_t)(n >> 12) & 63;
      n2 = (uint8_t)(n >> 6) & 63;
      n3 = (uint8_t)n & 63;
 
      /*
       * if we have one byte available, then its encoding is spread
       * out over two characters
       */
      if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
      result[resultIndex++] = base64chars[n0];
      if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
      result[resultIndex++] = base64chars[n1];
 
      /*
       * if we have only two bytes available, then their encoding is
       * spread out over three chars
       */
      if((x+1) < dataLength)
      {
         if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
         result[resultIndex++] = base64chars[n2];
      }
 
      /*
       * if we have all three bytes available, then their encoding is spread
       * out over four characters
       */
      if((x+2) < dataLength)
      {
         if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
         result[resultIndex++] = base64chars[n3];
      }
   }  
 
   /*
    * create and add padding that is required if we did not have a multiple of 3
    * number of characters available
    */
   if (padCount > 0) 
   { 
      for (; padCount < 3; padCount++) 
      { 
         if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
         result[resultIndex++] = '=';
      } 
   }
   if(resultIndex >= resultSize) return 1;   /* indicate failure: buffer too small */
   result[resultIndex] = 0;
   return 0;   /* indicate success */
}

C++[edit]

Below is a C++ version of Base64Encode. This code is released into public domain.

#include <string>
#include <vector>
 
// Prototype
// std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer);
// This line goes in header file
 
/* Define these if they aren't already in your environment
 * #define TEXT(x) Lx    //Unicode
 * #define TCHAR wchar_t //Unicode
 * #define TCHAR char    //Not unicode
 * #define TEXT(x) x     //Not unicode
 * #define DWORD long
 * #define BYTE unsigned char
 * They are defined by default in Windows.h
 */
 
//Lookup table for encoding
//If you want to use an alternate alphabet, change the characters here
const static TCHAR encodeLookup[] = TEXT("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/");
const static TCHAR padCharacter = TEXT('=');
std::basic_string<TCHAR> base64Encode(std::vector<BYTE> inputBuffer)
{
	std::basic_string<TCHAR> encodedString;
	encodedString.reserve(((inputBuffer.size()/3) + (inputBuffer.size() % 3 > 0)) * 4);
	DWORD temp;
	std::vector<BYTE>::iterator cursor = inputBuffer.begin();
	for(size_t idx = 0; idx < inputBuffer.size()/3; idx++)
	{
		temp  = (*cursor++) << 16; //Convert to big endian
		temp += (*cursor++) << 8;
		temp += (*cursor++);
		encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
		encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
		encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
		encodedString.append(1,encodeLookup[(temp & 0x0000003F)      ]);
	}
	switch(inputBuffer.size() % 3)
	{
	case 1:
		temp  = (*cursor++) << 16; //Convert to big endian
		encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
		encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
		encodedString.append(2,padCharacter);
		break;
	case 2:
		temp  = (*cursor++) << 16; //Convert to big endian
		temp += (*cursor++) << 8;
		encodedString.append(1,encodeLookup[(temp & 0x00FC0000) >> 18]);
		encodedString.append(1,encodeLookup[(temp & 0x0003F000) >> 12]);
		encodedString.append(1,encodeLookup[(temp & 0x00000FC0) >> 6 ]);
		encodedString.append(1,padCharacter);
		break;
	}
	return encodedString;
}

R[edit]

base64encode <- function(sobj) {
  sstr <- as.character(sobj)
  stopifnot(length(sstr) == 1) # we only like 1-entry string vectors for now
  if (nchar(sstr) == 0) return("")
  b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  shfts <- c(18,12,6,0)
  sand <- function(n,s) bitwAnd(bitwShiftR(n,s),63)+1
  slft <- function(p,n) bitwShiftL(as.integer(p),n)
  subs <- function(s,n) substring(s,n,n)
  sbit <- charToRaw(sstr)  
  npad <- ( 3 - length(sbit) %% 3) %% 3 # yeah.
  sbit <- c(sbit,as.raw(rep(0,npad)))
  pces <- lapply(seq(1,length(sbit),by=3),function(ii) sbit[ii:(ii+2)])
  encv <- paste0(sapply(pces,function(p) paste0(sapply(shfts,function(s)(subs(b64c,sand(slft(p[1],16)+slft(p[2],8)+slft(p[3],0),s)))))),collapse="")
  if (npad > 0) substr(encv,nchar(encv)-npad+1,nchar(encv)) <- paste0(rep("=",npad),collapse="")
  return(encv)
}

Decode[edit]

Javascript[edit]

The base64 decoding function:

function base64_decode (s)
{
  // remove/ignore any characters not in the base64 characters list
  //  or the pad character -- particularly newlines
  s = s.replace(new RegExp('[^'+base64chars.join("")+'=]', 'g'), "");
 
  // replace any incoming padding with a zero pad (the 'A' character is zero)
  var p = (s.charAt(s.length-1) == '=' ? 
          (s.charAt(s.length-2) == '=' ? 'AA' : 'A') : ""); 
  var r = ""; 
  s = s.substr(0, s.length - p.length) + p;
 
  // increment over the length of this encoded string, four characters at a time
  for (var c = 0; c < s.length; c += 4) {
 
    // each of these four characters represents a 6-bit index in the base64 characters list
    //  which, when concatenated, will give the 24-bit number for the original 3 characters
    var n = (base64inv[s.charAt(c)] << 18) + (base64inv[s.charAt(c+1)] << 12) +
            (base64inv[s.charAt(c+2)] << 6) + base64inv[s.charAt(c+3)];
 
    // split the 24-bit number into the original three 8-bit (ASCII) characters
    r += String.fromCharCode((n >>> 16) & 255, (n >>> 8) & 255, n & 255);
  }
   // remove any zero pad that was added to make this a multiple of 24 bits
  return r.substring(0, r.length - p.length);
}

The above implementation is best with a language like JavaScript that handles string concatenation of arbitrary length strings very efficiently. Other languages (e.g. C) will work much more efficiently by allocating memory for a new string/array of the appropriate size (the output string length is easily calculated from the input string at the very beginning) and then simply setting each character index, as opposed to concatenation.

Algorithm Implementation/Miscellaneous/Base64===Java=== The base64 decoding function:

class Base64Decode {
    private final static String base64chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
    public static String decode(String s) {
 
	// remove/ignore any characters not in the base64 characters list
	// or the pad character -- particularly newlines
	s = s.replaceAll("[^" + base64chars + "=]", "");
 
	// replace any incoming padding with a zero pad (the 'A' character is
	// zero)
	String p = (s.charAt(s.length() - 1) == '=' ? 
		(s.charAt(s.length() - 2) == '=' ? "AA" : "A") : "");
	String r = "";
	s = s.substring(0, s.length() - p.length()) + p;
 
	// increment over the length of this encoded string, four characters
	// at a time
	for (int c = 0; c < s.length(); c += 4) {
 
	    // each of these four characters represents a 6-bit index in the
	    // base64 characters list which, when concatenated, will give the
	    // 24-bit number for the original 3 characters
	    int n = (base64chars.indexOf(s.charAt(c)) << 18)
		    + (base64chars.indexOf(s.charAt(c + 1)) << 12)
		    + (base64chars.indexOf(s.charAt(c + 2)) << 6)
		    + base64chars.indexOf(s.charAt(c + 3));
 
	    // split the 24-bit number into the original three 8-bit (ASCII)
	    // characters
	    r += "" + (char) ((n >>> 16) & 0xFF) + (char) ((n >>> 8) & 0xFF)
		    + (char) (n & 0xFF);
	}
 
	// remove any zero pad that was added to make this a multiple of 24 bits
	return r.substring(0, r.length() - p.length());
    }
}

An alternate implementation could use Java Streams. For example one could create a Base64DecoderStream, inheriting from FilterInputStream.

C[edit]

Counterpart C decoder for the encoder above. This code is also public domain. This solution has been optimized using pointer math and a look-up table. This algorithm handles multiple encoding formats: with and without line breaks, with and without whitespace, and with and without padding characters.

#define WHITESPACE 64
#define EQUALS     65
#define INVALID    66
 
static const unsigned char d[] = {
    66,66,66,66,66,66,66,66,66,64,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,62,66,66,66,63,52,53,
    54,55,56,57,58,59,60,61,66,66,66,65,66,66,66, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
    10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,66,66,66,66,66,66,26,27,28,
    29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,66,
    66,66,66,66,66,66
};
 
int base64decode (char *in, size_t inLen, unsigned char *out, size_t *outLen) { 
    char *end = in + inLen;
    size_t buf = 1, len = 0;
 
    while (in < end) {
        unsigned char c = d[*in++];
 
        switch (c) {
        case WHITESPACE: continue;   /* skip whitespace */
        case INVALID:    return 1;   /* invalid input, return error */
        case EQUALS:                 /* pad character, end of data */
            in = end;
            continue;
        default:
            buf = buf << 6 | c;
 
            /* If the buffer is full, split it into bytes */
            if (buf & 0x1000000) {
                if ((len += 3) > *outLen) return 1; /* buffer overflow */
                *out++ = buf >> 16;
                *out++ = buf >> 8;
                *out++ = buf;
                buf = 1;
            }   
        }
    }
 
    if (buf & 0x40000) {
        if ((len += 2) > *outLen) return 1; /* buffer overflow */
        *out++ = buf >> 10;
        *out++ = buf >> 2;
    }
    else if (buf & 0x1000) {
        if (++len > *outLen) return 1; /* buffer overflow */
        *out++ = buf >> 4;
    }
 
    *outLen = len; /* modify to reflect the actual output size */
    return 0;
}

C++[edit]

Counterpart C++ decoder for the encoder above. This code is also public domain. While a more general solution is possible by using a table similar to that in encoding, and then finding the index of the character we want in that table, and while that solution is much simpler, this version with the if-else-if ladder is faster because it does not need to run as many character comparisons to find the right character to use. The alphabet specific parts of this code are pretty much the same across most base64 alphabets however.

Items that need changed to accommodate the base64url encoding are commented as such.

#include <string>
#include <vector>
 
// Prototype
// std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input);
// This line goes in header file
 
/* Define these if they aren't already in your environment
 * #define TEXT(x) Lx    //Unicode
 * #define TCHAR wchar_t //Unicode
 * #define TCHAR char    //Not unicode
 * #define TEXT(x) x     //Not unicode
 * #define DWORD long
 * #define BYTE unsigned char
 * They are defined by default in Windows.h
 */
 
const static TCHAR padCharacter = TEXT('=');
std::vector<BYTE> base64Decode(const std::basic_string<TCHAR>& input)
{
	if (input.length() % 4) //Sanity check
		throw std::runtime_error("Non-Valid base64!");
	size_t padding = 0;
	if (input.length())
	{
		if (input[input.length()-1] == padCharacter)
			padding++;
		if (input[input.length()-2] == padCharacter)
			padding++;
	}
	//Setup a vector to hold the result
	std::vector<BYTE> decodedBytes;
	decodedBytes.reserve(((input.length()/4)*3) - padding);
	DWORD temp=0; //Holds decoded quanta
	std::basic_string<TCHAR>::const_iterator cursor = input.begin();
	while (cursor < input.end())
	{
		for (size_t quantumPosition = 0; quantumPosition < 4; quantumPosition++)
		{
			temp <<= 6;
			if       (*cursor >= 0x41 && *cursor <= 0x5A) // This area will need tweaking if
				temp |= *cursor - 0x41;		              // you are using an alternate alphabet
			else if  (*cursor >= 0x61 && *cursor <= 0x7A)
				temp |= *cursor - 0x47;
			else if  (*cursor >= 0x30 && *cursor <= 0x39)
				temp |= *cursor + 0x04;
			else if  (*cursor == 0x2B)
				temp |= 0x3E; //change to 0x2D for URL alphabet
			else if  (*cursor == 0x2F)
				temp |= 0x3F; //change to 0x5F for URL alphabet
			else if  (*cursor == padCharacter) //pad
			{
				switch( input.end() - cursor )
				{
				case 1: //One pad character
					decodedBytes.push_back((temp >> 16) & 0x000000FF);
					decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
					return decodedBytes;
				case 2: //Two pad characters
					decodedBytes.push_back((temp >> 10) & 0x000000FF);
					return decodedBytes;
				default:
					throw std::runtime_error("Invalid Padding in Base 64!");
				}
			}  else
				throw std::runtime_error("Non-Valid Character in Base 64!");
			cursor++;
		}
		decodedBytes.push_back((temp >> 16) & 0x000000FF);
		decodedBytes.push_back((temp >> 8 ) & 0x000000FF);
		decodedBytes.push_back((temp      ) & 0x000000FF);
	}
	return decodedBytes;
}

R[edit]

base64decode <- function(eobj) {
  estr <- as.character(eobj)
  stopifnot(length(estr) == 1)
  b64c <- "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"
  estr <- gsub(paste0("[^",b64c,"=]"),"",estr)
  if (nchar(estr) == 0) return("")
  estr <- gsub("=","A",estr,fixed=T)
  ebit <- charToRaw(estr); b64r <- charToRaw(b64c)
  pces <- lapply(seq(1,length(ebit),by=4),function(ii) ebit[ii:(ii+3)])
  slft <- function(p,n) bitwShiftL(match(p,b64r)-1,n) # note the -1
  sand <- function(n,s) intToUtf8(bitwAnd(bitwShiftR(n,s),0xFF))
  return(paste0(sapply(pces,function(p){
    n <- slft(p[1],18) + slft(p[2],12) + slft(p[3],6) + slft(p[4],0)
    c(sand(n,16),sand(n,8),sand(n,0))
  }),collapse=""))
}