OpenSceneGraph/src/osgPlugins/osgjs/json_stream

/*  -*-c++-*-
 *  Copyright (C) 2010 Cedric Pinson <cedric.pinson@plopbyte.net>
 *
 * This library is open source and may be redistributed and/or modified under
 * the terms of the OpenSceneGraph Public License (OSGPL) version 0.0 or
 * (at your option) any later version.  The full license is in LICENSE file
 * included with this distribution, and on the openscenegraph.org website.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * OpenSceneGraph Public License for more details.
 *
 * Authors:
 *         Cedric Pinson <cedric.pinson@plopbyte.net>
 */

#ifndef JSON_STREAM
#define JSON_STREAM

#include <iostream>
#include <iomanip>
#include <cctype>  // control characters
#include <sstream>
#include <string>
#include <cmath>
#include <limits>

#include <osgDB/fstream>

using namespace std;

// A simple class wrapping ofstream calls to enable generic cleaning of json data.
// In particular, 'standard' json:
// * requires utf-8 encoded strings
// * disallows some control characters
// * does not support inf or nan values

#if defined(WIN32) && !defined(__MINGW32__) && (!defined(_MSC_VER) || _MSC_VER<=1700)
// Global isfinite()/isinf() fallbacks for the Windows toolchains selected by
// the guard above, expressed with the CRT's _finite() and _isnan().

// Returns whatever _finite() reports for x.
inline int isfinite( double x )
{
    return _finite( x );
}

// Returns 1 when x is rejected by _finite() without being a NaN, 0 otherwise.
inline int isinf( double x )
{
    if ( _finite( x ) ) {
        return 0;
    }
    return _isnan( x ) ? 0 : 1;
}
#endif


// Writes JSON text to a file, cleaning each value streamed through operator<<.
//
// In strict mode (the default):
//  * float/double values that are NaN or infinite are replaced by finite ones,
//  * std::string / const char* values are rewritten as well-formed UTF-8 and
//    most control characters are turned into \uXXXX escapes.
// In non-strict mode values are written untouched.
//
// All output goes through the `_stream` member.
// NOTE(review): the inherited osgDB::ofstream sub-object is default-constructed
// by the constructor below and is not used by any method of this class.
class json_stream : public osgDB::ofstream {
    public:
        // Opens `filename` for writing; `strict` turns the cleaning on or off.
        json_stream(const std::string& filename, bool strict=true) :
            _stream(filename.c_str()),
            _strict(strict)
        {}

        ~json_stream() {
            _stream.close();
        }

        // True while the wrapped file stream is open.
        operator bool() const {
            return _stream.is_open();
        }

        // forward std::endl (and the other ostream manipulators)
        typedef std::ostream& (*ostream_manipulator)(std::ostream&);
        json_stream& operator<<(ostream_manipulator pf) {
            if (_stream.is_open()) {
                _stream << pf;
            }
            return *this;
        }

        // Writes sanitize(data); silently does nothing when the file is not open.
        template<typename T>
        json_stream& operator<<(const T& data) {
            if (_stream.is_open()) {
                _stream << sanitize(data);
            }
            return *this;
        }

        // Fallback for every type without a dedicated overload: no cleaning.
        template<typename T>
        const T sanitize(const T& t) {
            return t;
        }

        // Floats are widened to double and cleaned like doubles.
        double sanitize(const float f) {
            return sanitize(static_cast<double>(f));
        }

        // Returns a finite value in strict mode, `d` itself otherwise.
        double sanitize(const double d) {
            if(_strict) {
                return to_valid_float(d);
            }
            return d;
        }

        // Maps values JSON cannot represent onto finite ones:
        //   NaN       -> 0
        //   +infinity -> largest finite double
        //   -infinity -> most negative finite double
        // Any finite value is returned unchanged.
        double to_valid_float(const double d) {
            // NaN is the only value that compares unequal to itself; there is
            // not much better to do than to replace it by 0.
            if(d != d) {
                return 0.;
            }

            // Plain comparisons against the finite limits keep the sign of an
            // infinity and avoid depending on which isfinite()/isinf() the
            // toolchain puts in scope.
            const double largest = std::numeric_limits<double>::max();
            if(d > largest) {
                return largest;
            }
            if(d < -largest) {
                return -largest;
            }
            return d;
        }

        // Returns the cleaned string in strict mode, `s` itself otherwise.
        std::string sanitize(const std::string& s) {
            if(_strict) {
                return to_json_utf8(s);
            }
            return s;
        }

        // C-string flavour of sanitize(); a null pointer is treated as "".
        std::string sanitize(const char* s) {
            if(!s) {
                return std::string();
            }
            return sanitize(std::string(s));
        }

        // Cleans `s` unconditionally (see clean_invalid_utf8).
        std::string to_json_utf8(const std::string& s) {
            // TODO: try to decode latin1 if string is not valid utf8
            // before actually fixing bad 'chars'
            return clean_invalid_utf8(s);
        }


    protected:
        std::ofstream _stream;
        bool _strict;

        // Text written in place of the character `ctrl`.
        // The characters listed below are written as-is; anything else becomes
        // a \uXXXX escape (4 lowercase hex digits, zero padded).
        // NOTE(review): the listed control characters are emitted raw rather
        // than as \b, \t, ... escapes; presumably because the strings streamed
        // here also carry the document's own layout - confirm before changing.
        std::string json_encode_control_char(int ctrl) {
            // see http://json.org
            std::ostringstream oss;

            if(ctrl == 8  || // \b
               ctrl == 9  || // \t
               ctrl == 10 || // \n
               ctrl == 12 || // \f
               ctrl == 13 || // \r
               ctrl == 27 || // ESC
               ctrl == 34 || // \"
               ctrl == 47 // \/
              ) {
                oss << static_cast<char>(ctrl);
            }
            else {
                oss.fill('0');
                oss << "\\u" << std::setw(4) << std::hex << ctrl;
            }

            return oss.str();
        }


        // True when `byte` has the 10xxxxxx shape of a UTF-8 continuation byte.
        inline bool is_valid_continuation_byte(unsigned int byte) {
            return ((byte & 0xC0) == 0x80);
        }

        // Moves `iterator` to the following byte and returns it as a value in
        // [0, 255]. When there is no following byte the iterator is left where
        // it is and 0 is returned, which is never a valid continuation byte.
        inline int get_next_byte(std::string::const_iterator& iterator, std::string::const_iterator end_iterator) {
            if(iterator != end_iterator) {
                std::string::const_iterator next = iterator;
                ++ next;
                if(next != end_iterator) {
                    iterator = next;
                    // go through unsigned char: plain char may be signed
                    return static_cast<unsigned char>(*iterator);
                }
            }
            return 0; // invalid continuation byte
        }

        // Encodes `code_point` as UTF-8.
        // Values above U+10FFFF are encoded as U+FFFD.
        // from http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
        std::string utf8_encode_codepoint(unsigned code_point)
        {
            std::string output;

            // NOTE(review): code points U+0591..U+05F3 are dropped (encoded as
            // an empty string). The reason is not visible in this file;
            // behaviour kept as found - confirm it is still wanted.
            if(code_point > 0x590 && code_point < 0x5F4) {
                return output;
            }

            // out of range
            if(code_point > 0x10FFFF) {
                return utf8_encode_codepoint(0xfffd);
            }

            if (code_point < 0x80) {
                output.push_back(static_cast<char>(code_point));
            }
            else if (code_point <= 0x7FF) {
                output.push_back(static_cast<char>((code_point >> 6) + 0xC0));
                output.push_back(static_cast<char>((code_point & 0x3F) + 0x80));
            }
            else if (code_point <= 0xFFFF) {
                output.push_back(static_cast<char>((code_point >> 12) + 0xE0));
                output.push_back(static_cast<char>(((code_point >> 6) & 0x3F) + 0x80));
                output.push_back(static_cast<char>((code_point & 0x3F) + 0x80));
            }
            else {
                output.push_back(static_cast<char>((code_point >> 18) + 0xF0));
                output.push_back(static_cast<char>(((code_point >> 12) & 0x3F) + 0x80));
                output.push_back(static_cast<char>(((code_point >> 6) & 0x3F) + 0x80));
                output.push_back(static_cast<char>((code_point & 0x3F) + 0x80));
            }
            return output;
        }

        // Returns a copy of `input` in which
        //  * ASCII control characters go through json_encode_control_char,
        //  * well-formed UTF-8 sequences are decoded and re-encoded,
        //  * every byte that does not belong to a well-formed sequence is
        //    replaced by `replacement_codepoint` (U+FFFD by default).
        // A byte that fails a continuation check is not consumed: it is
        // examined again on its own, so a valid character following a broken
        // sequence is never lost.
        // from http://en.wikipedia.org/wiki/UTF-8#Invalid_byte_sequences
        std::string clean_invalid_utf8(const std::string& input,
                                       const int replacement_codepoint=0xfffd) {
            const std::string replacement = utf8_encode_codepoint(replacement_codepoint);
            const std::string::const_iterator input_end = input.end();
            std::string output;
            output.reserve(input.size());

            for(std::string::const_iterator iterator = input.begin() ; iterator != input_end ; ++ iterator) {
                // go through unsigned char: plain char may be signed, which
                // would make every byte >= 0x80 look like a negative "ASCII" value
                const int code_unit1 = static_cast<unsigned char>(*iterator);

                if (code_unit1 < 0x80) {
                    if(std::iscntrl(code_unit1)) {
                        output += json_encode_control_char(code_unit1);
                    }
                    else {
                        output.push_back(static_cast<char>(code_unit1));
                    }
                    continue;
                }

                // `probe` walks over the continuation bytes; `iterator` only
                // catches up with it once the whole sequence proved valid.
                std::string::const_iterator probe = iterator;
                bool well_formed = false;
                unsigned code_point = 0;

                if (code_unit1 < 0xC2) {
                    // continuation or overlong 2-byte sequence
                }
                else if (code_unit1 < 0xE0) {
                    // 2-byte sequence
                    const int code_unit2 = get_next_byte(probe, input_end);

                    if (is_valid_continuation_byte(code_unit2)) {
                        code_point = (code_unit1 << 6) + code_unit2 - 0x3080;
                        well_formed = true;
                    }
                }
                else if (code_unit1 < 0xF0) {
                    // 3-byte sequence
                    const int code_unit2 = get_next_byte(probe, input_end);

                    if (is_valid_continuation_byte(code_unit2) &&
                        !(code_unit1 == 0xE0 && code_unit2 < 0xA0) &&   /* overlong */
                        !(code_unit1 == 0xED && code_unit2 >= 0xA0)) {  /* surrogate */
                        const int code_unit3 = get_next_byte(probe, input_end);

                        if (is_valid_continuation_byte(code_unit3)) {
                            code_point = (code_unit1 << 12) +
                                         (code_unit2 << 6) +
                                         code_unit3 - 0xE2080;
                            well_formed = true;
                        }
                    }
                }
                else if (code_unit1 < 0xF5) {
                    // 4-byte sequence
                    const int code_unit2 = get_next_byte(probe, input_end);

                    if (is_valid_continuation_byte(code_unit2) &&
                        !(code_unit1 == 0xF0 && code_unit2 < 0x90) &&   /* overlong */
                        !(code_unit1 == 0xF4 && code_unit2 >= 0x90)) {  /* > U+10FFFF */
                        const int code_unit3 = get_next_byte(probe, input_end);

                        if (is_valid_continuation_byte(code_unit3)) {
                            const int code_unit4 = get_next_byte(probe, input_end);

                            if (is_valid_continuation_byte(code_unit4)) {
                                code_point = (code_unit1 << 18) +
                                             (code_unit2 << 12) +
                                             (code_unit3 << 6) +
                                             code_unit4 - 0x3C82080;
                                well_formed = true;
                            }
                        }
                    }
                }
                // else: lead byte of a sequence > U+10FFFF

                if (well_formed) {
                    output += utf8_encode_codepoint(code_point);
                    iterator = probe; // now on the last byte of the sequence
                }
                else {
                    // Only the lead byte is replaced here; the bytes after it
                    // are looked at again by the next iterations.
                    output += replacement;
                }
            }
            return output;
        }
};


#endif