codecvt_specializations.h

Go to the documentation of this file.
00001 // Locale support (codecvt) -*- C++ -*-
00002 
00003 // Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006
00004 //  Free Software Foundation, Inc.
00005 //
00006 // This file is part of the GNU ISO C++ Library.  This library is free
00007 // software; you can redistribute it and/or modify it under the
00008 // terms of the GNU General Public License as published by the
00009 // Free Software Foundation; either version 2, or (at your option)
00010 // any later version.
00011 
00012 // This library is distributed in the hope that it will be useful,
00013 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00014 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00015 // GNU General Public License for more details.
00016 
00017 // You should have received a copy of the GNU General Public License along
00018 // with this library; see the file COPYING.  If not, write to the Free
00019 // Software Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301,
00020 // USA.
00021 
00022 // As a special exception, you may use this file as part of a free software
00023 // library without restriction.  Specifically, if other files instantiate
00024 // templates or use macros or inline functions from this file, or you compile
00025 // this file and link it with other files to produce an executable, this
00026 // file does not by itself cause the resulting executable to be covered by
00027 // the GNU General Public License.  This exception does not however
00028 // invalidate any other reasons why the executable file might be covered by
00029 // the GNU General Public License.
00030 
00031 //
00032 // ISO C++ 14882: 22.2.1.5 Template class codecvt
00033 //
00034 
00035 // Written by Benjamin Kosnik <bkoz@redhat.com>
00036 
00037 /** @file ext/codecvt_specializations.h
00038  *  This file is a GNU extension to the Standard C++ Library.
00039  */
00040 
00041 #ifndef _EXT_CODECVT_SPECIALIZATIONS_H
00042 #define _EXT_CODECVT_SPECIALIZATIONS_H 1
00043 
00044 #include <bits/c++config.h>
00045 
00046 #ifdef _GLIBCXX_USE_ICONV
00047 
00048 #include <locale>
00049 #include <iconv.h>
00050 
00051   // XXX
00052   // Define this here so codecvt.cc can have _S_max_size definition.
00053 #define _GLIBCXX_USE_ENCODING_STATE 1
00054 
00055 _GLIBCXX_BEGIN_NAMESPACE(__gnu_cxx)
00056 
/// @brief  Extension to use iconv for dealing with character encodings.
// This includes conversions and comparisons between various character
// sets.  This object encapsulates data that may need to be shared between
// char_traits, codecvt and ctype.
//
// An object stores the two encoding names and owns up to two iconv
// conversion descriptors opened from them (external -> internal and
// internal -> external).  The descriptors are closed by the destructor.
class encoding_state
{
public:
  // Types:
  // NB: A conversion descriptor subsumes and enhances the
  // functionality of a simple state type such as mbstate_t.
  typedef iconv_t descriptor_type;

protected:
  // Name of internal character set encoding.
  std::string       _M_int_enc;

  // Name of external character set encoding.
  std::string       _M_ext_enc;

  // Conversion descriptor between external encoding to internal encoding.
  descriptor_type   _M_in_desc;

  // Conversion descriptor between internal encoding to external encoding.
  descriptor_type   _M_out_desc;

  // The byte-order marker for the external encoding, if necessary.
  int               _M_ext_bom;

  // The byte-order marker for the internal encoding, if necessary.
  int               _M_int_bom;

  // Number of external bytes needed to construct one complete
  // character in the internal encoding.
  // NB: -1 indicates variable, or stateful, encodings.
  int               _M_bytes;

public:
  // Creates a state without encoding names and without descriptors;
  // good() is false for such an object.
  explicit
  encoding_state()
  : _M_in_desc(0), _M_out_desc(0), _M_ext_bom(0), _M_int_bom(0), _M_bytes(0)
  { }

  // Creates a state for the given internal/external encoding names and
  // opens both descriptors.  A null name is treated as an empty name, in
  // which case no descriptor is opened.
  // Throws std::runtime_error (via __throw_runtime_error) if iconv_open
  // fails.
  explicit
  encoding_state(const char* __int, const char* __ext,
                 int __ibom = 0, int __ebom = 0, int __bytes = 1)
  : _M_int_enc(__int ? __int : ""), _M_ext_enc(__ext ? __ext : ""),
    _M_in_desc(0), _M_out_desc(0),
    _M_ext_bom(__ebom), _M_int_bom(__ibom), _M_bytes(__bytes)
  { init(); }

  // 21.1.2 traits typedefs
  // p4
  // typedef STATE_T state_type
  // requires: state_type shall meet the requirements of
  // CopyConstructible types (20.1.3)
  // NB: This does not preserve the actual state of the conversion
  // descriptor member, but it does duplicate the encoding
  // information.
  encoding_state(const encoding_state& __obj) : _M_in_desc(0), _M_out_desc(0)
  { construct(__obj); }

  // Need assignment operator as well.
  // Self-assignment is a no-op, so the live descriptors are not closed
  // and reopened for nothing.
  encoding_state&
  operator=(const encoding_state& __obj)
  {
    if (this != &__obj)
      construct(__obj);
    return *this;
  }

  ~encoding_state()
  { destroy(); }

  // True when both descriptors are open, i.e. neither is null nor the
  // iconv_open error value.
  bool
  good() const throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    bool __test = _M_in_desc && _M_in_desc != __err;
    __test &=  _M_out_desc && _M_out_desc != __err;
    return __test;
  }

  int
  character_ratio() const
  { return _M_bytes; }

  const std::string
  internal_encoding() const
  { return _M_int_enc; }

  int
  internal_bom() const
  { return _M_int_bom; }

  const std::string
  external_encoding() const
  { return _M_ext_enc; }

  int
  external_bom() const
  { return _M_ext_bom; }

  const descriptor_type&
  in_descriptor() const
  { return _M_in_desc; }

  const descriptor_type&
  out_descriptor() const
  { return _M_out_desc; }

protected:
  // Opens whichever descriptor is still null, provided both encoding
  // names are non-empty.  On failure every descriptor is left null (and
  // anything already opened is closed) before the exception is thrown:
  // when this runs inside a constructor the destructor will not, so
  // nothing else would release the first descriptor.
  void
  init()
  {
    const descriptor_type __err = (iconv_t)(-1);
    const bool __have_encodings = _M_int_enc.size() && _M_ext_enc.size();
    if (!_M_in_desc && __have_encodings)
      {
        _M_in_desc = iconv_open(_M_int_enc.c_str(), _M_ext_enc.c_str());
        if (_M_in_desc == __err)
          {
            // Do not keep the error value: a later init() only opens
            // descriptors that are null.
            _M_in_desc = 0;
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                "creating iconv input descriptor failed"));
          }
      }
    if (!_M_out_desc && __have_encodings)
      {
        _M_out_desc = iconv_open(_M_ext_enc.c_str(), _M_int_enc.c_str());
        if (_M_out_desc == __err)
          {
            _M_out_desc = 0;
            destroy();
            std::__throw_runtime_error(__N("encoding_state::_M_init "
                                "creating iconv output descriptor failed"));
          }
      }
  }

  // Replaces this object's encoding information with a copy of __obj's
  // and opens fresh descriptors for it.
  void
  construct(const encoding_state& __obj)
  {
    destroy();
    _M_int_enc = __obj._M_int_enc;
    _M_ext_enc = __obj._M_ext_enc;
    _M_ext_bom = __obj._M_ext_bom;
    _M_int_bom = __obj._M_int_bom;
    _M_bytes = __obj._M_bytes;
    init();
  }

  // Closes any open descriptor and leaves both descriptors null.
  void
  destroy() throw()
  {
    const descriptor_type __err = (iconv_t)(-1);
    if (_M_in_desc && _M_in_desc != __err)
      iconv_close(_M_in_desc);
    _M_in_desc = 0;
    if (_M_out_desc && _M_out_desc != __err)
      iconv_close(_M_out_desc);
    _M_out_desc = 0;
  }
};
00215 
00216   /// @brief  encoding_char_traits.
00217   // Custom traits type with encoding_state for the state type, and the
00218   // associated fpos<encoding_state> for the position type, all other
00219   // bits equivalent to the required char_traits instantiations.
00220   template<typename _CharT>
00221     struct encoding_char_traits : public std::char_traits<_CharT>
00222     {
00223       typedef encoding_state                state_type;
00224       typedef typename std::fpos<state_type>        pos_type;
00225     };
00226 
00227 _GLIBCXX_END_NAMESPACE
00228 
00229 
00230 _GLIBCXX_BEGIN_NAMESPACE(std)
00231 
00232   using __gnu_cxx::encoding_state;
00233 
00234   /// @brief  codecvt<InternT, _ExternT, encoding_state> specialization.
00235   // This partial specialization takes advantage of iconv to provide
00236   // code conversions between a large number of character encodings.
00237   template<typename _InternT, typename _ExternT>
00238     class codecvt<_InternT, _ExternT, encoding_state>
00239     : public __codecvt_abstract_base<_InternT, _ExternT, encoding_state>
00240     {
00241     public:      
00242       // Types:
00243       typedef codecvt_base::result          result;
00244       typedef _InternT                  intern_type;
00245       typedef _ExternT                  extern_type;
00246       typedef __gnu_cxx::encoding_state         state_type;
00247       typedef state_type::descriptor_type       descriptor_type;
00248 
00249       // Data Members:
00250       static locale::id         id;
00251 
00252       explicit 
00253       codecvt(size_t __refs = 0)
00254       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00255       { }
00256 
00257       explicit 
00258       codecvt(state_type& __enc, size_t __refs = 0)
00259       : __codecvt_abstract_base<intern_type, extern_type, state_type>(__refs)
00260       { }
00261 
00262      protected:
00263       virtual 
00264       ~codecvt() { }
00265 
00266       virtual result
00267       do_out(state_type& __state, const intern_type* __from, 
00268          const intern_type* __from_end, const intern_type*& __from_next,
00269          extern_type* __to, extern_type* __to_end,
00270          extern_type*& __to_next) const;
00271 
00272       virtual result
00273       do_unshift(state_type& __state, extern_type* __to, 
00274          extern_type* __to_end, extern_type*& __to_next) const;
00275 
00276       virtual result
00277       do_in(state_type& __state, const extern_type* __from, 
00278         const extern_type* __from_end, const extern_type*& __from_next,
00279         intern_type* __to, intern_type* __to_end, 
00280         intern_type*& __to_next) const;
00281 
00282       virtual int 
00283       do_encoding() const throw();
00284 
00285       virtual bool 
00286       do_always_noconv() const throw();
00287 
00288       virtual int 
00289       do_length(state_type&, const extern_type* __from, 
00290         const extern_type* __end, size_t __max) const;
00291 
00292       virtual int 
00293       do_max_length() const throw();
00294     };
00295 
00296   template<typename _InternT, typename _ExternT>
00297     locale::id 
00298     codecvt<_InternT, _ExternT, encoding_state>::id;
00299 
// Signature shim for the second argument of iconv().  SUSv2 and others
// declare it as 'const char**', whereas glibc 2.2 uses 'char**', which
// matches the POSIX 1003.1-2001 standard.  _Tp is deduced from whichever
// function is passed in, and the input buffer pointer is cast to that
// type, so g++ picks the right form for us.
template<typename _Tp>
  inline size_t
  __iconv_adaptor(size_t(*__func)(iconv_t, _Tp, size_t*, char**, size_t*),
                  iconv_t __cd, char** __inbuf, size_t* __inbytes,
                  char** __outbuf, size_t* __outbytes)
  {
    // The only argument whose type differs between the two prototypes.
    _Tp __adapted_inbuf = (_Tp)__inbuf;
    return __func(__cd, __adapted_inbuf, __inbytes, __outbuf, __outbytes);
  }
00310 
00311   template<typename _InternT, typename _ExternT>
00312     codecvt_base::result
00313     codecvt<_InternT, _ExternT, encoding_state>::
00314     do_out(state_type& __state, const intern_type* __from, 
00315        const intern_type* __from_end, const intern_type*& __from_next,
00316        extern_type* __to, extern_type* __to_end,
00317        extern_type*& __to_next) const
00318     {
00319       result __ret = codecvt_base::error;
00320       if (__state.good())
00321     {
00322       const descriptor_type& __desc = __state.out_descriptor();
00323       const size_t __fmultiple = sizeof(intern_type);
00324       size_t __fbytes = __fmultiple * (__from_end - __from);
00325       const size_t __tmultiple = sizeof(extern_type);
00326       size_t __tbytes = __tmultiple * (__to_end - __to); 
00327       
00328       // Argument list for iconv specifies a byte sequence. Thus,
00329       // all to/from arrays must be brutally casted to char*.
00330       char* __cto = reinterpret_cast<char*>(__to);
00331       char* __cfrom;
00332       size_t __conv;
00333 
00334       // Some encodings need a byte order marker as the first item
00335       // in the byte stream, to designate endian-ness. The default
00336       // value for the byte order marker is NULL, so if this is
00337       // the case, it's not necessary and we can just go on our
00338       // merry way.
00339       int __int_bom = __state.internal_bom();
00340       if (__int_bom)
00341         {     
00342           size_t __size = __from_end - __from;
00343           intern_type* __cfixed = static_cast<intern_type*>
00344         (__builtin_alloca(sizeof(intern_type) * (__size + 1)));
00345           __cfixed[0] = static_cast<intern_type>(__int_bom);
00346           char_traits<intern_type>::copy(__cfixed + 1, __from, __size);
00347           __cfrom = reinterpret_cast<char*>(__cfixed);
00348           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00349                                         &__fbytes, &__cto, &__tbytes); 
00350         }
00351       else
00352         {
00353           intern_type* __cfixed = const_cast<intern_type*>(__from);
00354           __cfrom = reinterpret_cast<char*>(__cfixed);
00355           __conv = __iconv_adaptor(iconv, __desc, &__cfrom, &__fbytes, 
00356                        &__cto, &__tbytes); 
00357         }
00358 
00359       if (__conv != size_t(-1))
00360         {
00361           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00362           __to_next = reinterpret_cast<extern_type*>(__cto);
00363           __ret = codecvt_base::ok;
00364         }
00365       else 
00366         {
00367           if (__fbytes < __fmultiple * (__from_end - __from))
00368         {
00369           __from_next = reinterpret_cast<const intern_type*>(__cfrom);
00370           __to_next = reinterpret_cast<extern_type*>(__cto);
00371           __ret = codecvt_base::partial;
00372         }
00373           else
00374         __ret = codecvt_base::error;
00375         }
00376     }
00377       return __ret; 
00378     }
00379 
00380   template<typename _InternT, typename _ExternT>
00381     codecvt_base::result
00382     codecvt<_InternT, _ExternT, encoding_state>::
00383     do_unshift(state_type& __state, extern_type* __to, 
00384            extern_type* __to_end, extern_type*& __to_next) const
00385     {
00386       result __ret = codecvt_base::error;
00387       if (__state.good())
00388     {
00389       const descriptor_type& __desc = __state.in_descriptor();
00390       const size_t __tmultiple = sizeof(intern_type);
00391       size_t __tlen = __tmultiple * (__to_end - __to); 
00392       
00393       // Argument list for iconv specifies a byte sequence. Thus,
00394       // all to/from arrays must be brutally casted to char*.
00395       char* __cto = reinterpret_cast<char*>(__to);
00396       size_t __conv = __iconv_adaptor(iconv,__desc, NULL, NULL,
00397                                           &__cto, &__tlen); 
00398       
00399       if (__conv != size_t(-1))
00400         {
00401           __to_next = reinterpret_cast<extern_type*>(__cto);
00402           if (__tlen == __tmultiple * (__to_end - __to))
00403         __ret = codecvt_base::noconv;
00404           else if (__tlen == 0)
00405         __ret = codecvt_base::ok;
00406           else
00407         __ret = codecvt_base::partial;
00408         }
00409       else 
00410         __ret = codecvt_base::error;
00411     }
00412       return __ret; 
00413     }
00414    
00415   template<typename _InternT, typename _ExternT>
00416     codecvt_base::result
00417     codecvt<_InternT, _ExternT, encoding_state>::
00418     do_in(state_type& __state, const extern_type* __from, 
00419       const extern_type* __from_end, const extern_type*& __from_next,
00420       intern_type* __to, intern_type* __to_end, 
00421       intern_type*& __to_next) const
00422     { 
00423       result __ret = codecvt_base::error;
00424       if (__state.good())
00425     {
00426       const descriptor_type& __desc = __state.in_descriptor();
00427       const size_t __fmultiple = sizeof(extern_type);
00428       size_t __flen = __fmultiple * (__from_end - __from);
00429       const size_t __tmultiple = sizeof(intern_type);
00430       size_t __tlen = __tmultiple * (__to_end - __to); 
00431       
00432       // Argument list for iconv specifies a byte sequence. Thus,
00433       // all to/from arrays must be brutally casted to char*.
00434       char* __cto = reinterpret_cast<char*>(__to);
00435       char* __cfrom;
00436       size_t __conv;
00437 
00438       // Some encodings need a byte order marker as the first item
00439       // in the byte stream, to designate endian-ness. The default
00440       // value for the byte order marker is NULL, so if this is
00441       // the case, it's not necessary and we can just go on our
00442       // merry way.
00443       int __ext_bom = __state.external_bom();
00444       if (__ext_bom)
00445         {     
00446           size_t __size = __from_end - __from;
00447           extern_type* __cfixed =  static_cast<extern_type*>
00448         (__builtin_alloca(sizeof(extern_type) * (__size + 1)));
00449           __cfixed[0] = static_cast<extern_type>(__ext_bom);
00450           char_traits<extern_type>::copy(__cfixed + 1, __from, __size);
00451           __cfrom = reinterpret_cast<char*>(__cfixed);
00452           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00453                                        &__flen, &__cto, &__tlen); 
00454         }
00455       else
00456         {
00457           extern_type* __cfixed = const_cast<extern_type*>(__from);
00458           __cfrom = reinterpret_cast<char*>(__cfixed);
00459           __conv = __iconv_adaptor(iconv, __desc, &__cfrom,
00460                                        &__flen, &__cto, &__tlen); 
00461         }
00462 
00463       
00464       if (__conv != size_t(-1))
00465         {
00466           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00467           __to_next = reinterpret_cast<intern_type*>(__cto);
00468           __ret = codecvt_base::ok;
00469         }
00470       else 
00471         {
00472           if (__flen < static_cast<size_t>(__from_end - __from))
00473         {
00474           __from_next = reinterpret_cast<const extern_type*>(__cfrom);
00475           __to_next = reinterpret_cast<intern_type*>(__cto);
00476           __ret = codecvt_base::partial;
00477         }
00478           else
00479         __ret = codecvt_base::error;
00480         }
00481     }
00482       return __ret; 
00483     }
00484   
00485   template<typename _InternT, typename _ExternT>
00486     int 
00487     codecvt<_InternT, _ExternT, encoding_state>::
00488     do_encoding() const throw()
00489     {
00490       int __ret = 0;
00491       if (sizeof(_ExternT) <= sizeof(_InternT))
00492     __ret = sizeof(_InternT) / sizeof(_ExternT);
00493       return __ret; 
00494     }
00495   
00496   template<typename _InternT, typename _ExternT>
00497     bool 
00498     codecvt<_InternT, _ExternT, encoding_state>::
00499     do_always_noconv() const throw()
00500     { return false; }
00501   
00502   template<typename _InternT, typename _ExternT>
00503     int 
00504     codecvt<_InternT, _ExternT, encoding_state>::
00505     do_length(state_type&, const extern_type* __from, 
00506           const extern_type* __end, size_t __max) const
00507     { return std::min(__max, static_cast<size_t>(__end - __from)); }
00508 
00509   // _GLIBCXX_RESOLVE_LIB_DEFECTS
00510   // 74.  Garbled text for codecvt::do_max_length
00511   template<typename _InternT, typename _ExternT>
00512     int 
00513     codecvt<_InternT, _ExternT, encoding_state>::
00514     do_max_length() const throw()
00515     { return 1; }
00516 
00517 _GLIBCXX_END_NAMESPACE
00518 
00519 #endif
00520 
00521 #endif

Generated on Thu Nov 1 13:11:23 2007 for libstdc++ by  doxygen 1.5.1