I recently started writing an application in Node.js that reads raw data from a file, performs some action on it, and then sends the data over an HTTP connection in the HTTP body as multipart/binary. Until now I had always dealt with text and strings. Through data validation at the bit level I learned the hard way what type mismatches do to variables and how they compromise data integrity. Almost all the examples I have come across online assume one is dealing with strings, and none deal with raw binary data. This article is the product of my mishaps and of thorough analysis by colleagues, who provided much valuable insight into Node.js rules.

Let's start with a simple example, where we define several variables of different types, then use the += operator to concatenate string data to these variables.

"use strict"; 
// Demo 1: append the same string to variables that start out with different
// initial values, then print the resulting length and contents of each.
// Once `+=` is given a string operand, every target ends up as a string.
var fs = require('fs'); 
// The string that is appended (or assigned) to every tmpvarN below.
var mydata = 'somedata'; 
// One variable per starting value: undefined, '', null, [], {}, null, 0.
var tmpvar1; 
var tmpvar2 = ''; 
var tmpvar3 = null; 
var tmpvar4 = []; 
var tmpvar5 = {}; 
var tmpvar6 = null; 
var tmpvar7 = 0;

// undefined + string -> 'undefinedsomedata'
tmpvar1 += mydata; 
// '' + string -> 'somedata'
tmpvar2 += mydata; 
// null + string -> 'nullsomedata'
tmpvar3 += mydata; 
// [] + string -> 'somedata'
tmpvar4 += mydata; 
// {} + string -> '[object Object]somedata'
tmpvar5 += mydata; 
// Control case: plain assignment instead of `+=`, so the initial null is discarded.
tmpvar6 = mydata; 
// 0 + string -> '0somedata'
tmpvar7 += mydata;

// Report the length of the source string next to the length and contents of
// each result, so any text prepended by the coercion is visible.
console.log('length of mydata is: ',mydata.length,' , length of tmpvar1 is: ',tmpvar1.length, ' , tmpvar1 contents are: '+tmpvar1); 
console.log('length of mydata is: ',mydata.length,' , length of tmpvar2 as \'\' is: ',tmpvar2.length, ' , tmpvar2 contents are: '+tmpvar2); 
console.log('length of mydata is: ',mydata.length,' , length of tmpvar3 as null is: ',tmpvar3.length, ' , tmpvar3 contents are: '+tmpvar3); 
console.log('length of mydata is: ',mydata.length,' , length of tmpvar4 as \[\] is: ',tmpvar4.length, ' , tmpvar4 contents are: '+tmpvar4); 
console.log('length of mydata is: ',mydata.length,' , length of tmpvar5 as \{\} is: ',tmpvar5.length, ' , tmpvar5 contents are: '+tmpvar5); 
console.log('length of mydata is: ',mydata.length,' , length of tmpvar6 as assigned (not appended) is: ',tmpvar6.length, ' , tmpvar6 contents are: '+tmpvar6); 
console.log('length of mydata is: ',mydata.length,' , length of tmpvar7 as 0 is: ',tmpvar7.length, ' , tmpvar7 contents are: '+tmpvar7);

When run, this is the output it produces. By comparing the length and contents of each variable, and correlating them with how each tmpvar was defined, we can explain what is occurring. Comments on why things behave this way are inline.

length of mydata is: 8 , length of tmpvar1 is: 17 , tmpvar1 contents are: undefinedsomedata

The first case is normal JavaScript behavior (undefined variable += string). JavaScript attempts to be smart about type casting to match what you are asking the script to do; in this case, adding a string to an undefined variable. The undefined value is converted to a string and concatenated with mydata. The primitive value undefined becomes the string "undefined" when you attempt to add any string to it. In an actual program, it would probably make sense to either

  • 1) initialize the variable to an empty string, or
  • 2) do a type check for undefined, if (typeof tmpvar1 === 'undefined'), and in that case assign the first string value to the variable directly: { tmpvar1 = mydata; }

length of mydata is: 8 , length of tmpvar2 as '' is: 8 , tmpvar2 contents are: somedata       
length of mydata is: 8 , length of tmpvar3 as null is: 12 , tmpvar3 contents are: nullsomedata 
length of mydata is: 8 , length of tmpvar4 as [] is: 8 , tmpvar4 contents are: somedata 
length of mydata is: 8 , length of tmpvar5 as {} is: 23 , tmpvar5 contents are: [object Object]somedata 
length of mydata is: 8 , length of tmpvar6 as assigned (not appended) is: 8 , tmpvar6 contents are: somedata 

Assigning a string causes the resulting variable type to be a string.

length of mydata is: 8 , length of tmpvar7 as 0 is: 9 , tmpvar7 contents are: 0somedata                

0 is converted to string and concatenated

 

Now our example is changed to read raw binary data from a file:

// Demo 2: same experiment as before, but the value being appended is a raw
// chunk read from a file stream instead of a string.
// t1 toggles between the two read modes used inside the 'readable' handler.
var t1 = 1; 
// One variable per starting value: undefined, '', null, [], {}, null, 0.
var tmpvar21; 
var tmpvar22 = ''; 
var tmpvar23 = null; 
var tmpvar24 = []; 
var tmpvar25 = {}; 
var tmpvar26 = null; 
var tmpvar27 = 0;

// NOTE(review): `end` is normally a byte offset for createReadStream; passing
// `false` looks unintended here — confirm against the fs documentation.
var fsread = fs.createReadStream("file.sample", { end: false }); // file.sample is any binary file larger than 64KB.
fsread.on('error',function(e){ 
   console.log('debug -- got file read error: ',e); 
}).on('readable', function() { 
   // NOTE(review): read() may return null when nothing is buffered, in which
   // case chunk.length below would throw — confirm before reusing this code.
   if(t1 == 1) { var chunk = fsread.read(); t1 = 0; } // First pass: read a chunk using the default chunk size.
   else { var chunk = fsread.read(20); t1 = 1;} // Second pass: read a chunk of 20 bytes.
   // Append (or, for tmpvar26, assign) the chunk to each differently-initialised variable.
   tmpvar21 += chunk; 
   tmpvar22 += chunk; 
   tmpvar23 += chunk; 
   tmpvar24 += chunk; 
   tmpvar25 += chunk; 
   tmpvar26 = chunk; 
   tmpvar27 += chunk; 
   // Print the chunk length next to each accumulated length to expose mismatches.
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar21 is: ',tmpvar21.length); 
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar22 as \'\' is: ',tmpvar22.length); 
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar23 as null is: ',tmpvar23.length); 
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar24 as \[\] is: ',tmpvar24.length);  
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar25 as \{\} is: ',tmpvar25.length); 
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar26 as assigned (not appended) is: ',tmpvar26.length); 
   console.log('length of chunk is: ',chunk.length,' , length of tmpvar27 as 0 is: ',tmpvar27.length); 
   // t1 is back to 1 only after the 20-byte read, so the demo stops after two passes.
   if(t1) { process.exit(0); } 
// Reaching 'end' means the file ran out before both passes completed; exit with status 1.
}).on('end', function() { process.exit(1); })

 

Output I get running node v0.12 is:

length of chunk is: 65536 , length of tmpvar21 is: 65544 
length of chunk is: 65536 , length of tmpvar22 as '' is: 65535

Since we have not called fsread.setEncoding(), fsread.read() returns a Buffer. Hence this is a string + buffer operation, which Node interprets as string + buffer.toString(). This indicates that toString() on the buffer returns 65535 characters from 65536 bytes. Since the data read in is raw binary, my guess is that it contains a non-UTF-8 character that gets removed when converted to a string.

length of chunk is: 65536 , length of tmpvar23 as null is: 65539
length of chunk is: 65536 , length of tmpvar24 as [] is: 65535
length of chunk is: 65536 , length of tmpvar25 as {} is: 65550
length of chunk is: 65536 , length of tmpvar26 as assigned (not appended) is: 65536
length of chunk is: 65536 , length of tmpvar27 as 0 is: 65536

This is number + buffer. It looks like both are converted to strings, so the length should be one more than that of tmpvar22, which it is.

length of chunk is: 20 , length of tmpvar21 is: 65564
length of chunk is: 20 , length of tmpvar22 as '' is: 65555
length of chunk is: 20 , length of tmpvar23 as null is: 65559
length of chunk is: 20 , length of tmpvar24 as [] is: 65555
length of chunk is: 20 , length of tmpvar25 as {} is: 65570
length of chunk is: 20 , length of tmpvar26 as assigned (not appended) is: 20
length of chunk is: 20 , length of tmpvar27 as 0 is: 65556

The lesson here: do not mix variables of different types, and if you do, make sure you are getting the result you want!

So how do we deal with raw data if there is no raw-data variable type? Node.js uses the Buffer class for this.

If you plan to append data to a Buffer variable, you need to initialize it with new Buffer(0). Also note that using the += operator to append Buffer data containing raw binary data does not work; we need to use Buffer.concat() for this.

Here is sample code:

// Accumulate the whole file into one Buffer by concatenating each chunk as it
// arrives, so the raw bytes are never coerced to a string.
// NOTE(review): new Buffer(size) is deprecated on current Node.js releases,
// where Buffer.alloc(0) is the replacement; kept as-is for the Node v0.12
// this article was written against — confirm before reusing.
var mybuff = new Buffer(0);
var fsread = fs.createReadStream("file.sample");
fsread.on('error',function(e){
    console.log('Error reading file: ',e);
}).on('data', function(chunk) {
    // Buffer.concat returns a new Buffer holding the old contents plus this chunk.
    mybuff = Buffer.concat([mybuff,chunk]);
// The sample simply exits (status 1) once the stream has been fully consumed.
}).on('end', function() { process.exit(1); });

If you have a large amount of raw data that you want to read in and then act on, my suggestion is not to use Buffer.concat() to create one large buffer. Instead, for better performance, push the data into an array and iterate through the array elements at the end. If at all possible, deal with the data on the spot to avoid having to cache it, making your app more dynamic and less dependent on memory resources. Certainly, if you are just reading and writing raw data between streams (filesystem to HTTP, or vice versa), using Node.js stream.pipe() is the way to do it.

// Collect each chunk in an array instead of repeatedly concatenating into one
// large Buffer; the chunks are stored untouched, in arrival order.
var myarray = [];
var fsread = fs.createReadStream("file.sample");
fsread.on('error',function(e){
    console.log('Error reading file: ',e);
}).on('data', function(chunk) {
    myarray.push(chunk);
// The sample simply exits (status 1) once the stream ends; real code would
// iterate over myarray here before exiting.
}).on('end', function() { process.exit(1); });