misc/string.cpp Source File

00001 #include "string.h"
00002 
00016 #include <cassert>
00017 #include <cstdlib>
00018 #include <iostream>
00019 #include <iomanip>
00020 using namespace std;
00021 
// Report whether a string holds "binary" data.
//
// Returns true as soon as a character is found that is neither in the
// printable ASCII range (32..126) nor one of tab, carriage return or
// line feed.  Returns false for an empty string.
bool isBinary(const std::string  &str)
{
    for (std::string::size_type pos = 0; pos < str.size(); ++pos)
    {
        const char ch = str[pos];

        const bool printable  = (ch >= 32 && ch <= 126);
        const bool whitespace = (ch == '\t' || ch == '\r' || ch == '\n');

        if (!printable && !whitespace)
            return true;
    }

    return false;
}
00036 
// Remove every carriage return (ASCII 13) from src and store the result
// in dest.
//
// dest and src may be the same string: the output is assembled in a
// temporary and only swapped into dest after src has been read in full.
void dos2unix(std::string &dest,const std::string &src)
{
    const char CR = 13;

    // Count the carriage returns so the result can be sized exactly

    std::string::size_type removed = 0;
    for (std::string::size_type i=0;i<src.size();i++)
        if (src[i]==CR)
            removed++;

    // Bail out early, if possible (plain copy, harmless when aliased)

    if (removed==0)
    {
        dest = src;
        return;
    }

    // Copy everything except ASCII 13

    std::string result;
    result.reserve(src.size()-removed);

    for (std::string::size_type j=0;j<src.size();j++)
        if (src[j]!=CR)
            result += src[j];

    assert(result.size()==src.size()-removed);

    dest.swap(result);
}
00070 
// Remove every carriage return (code 13) from the wide string src and
// store the result in dest.
//
// dest and src may be the same string: the output is assembled in a
// temporary and only swapped into dest after src has been read in full.
void dos2unix(std::wstring &dest,const std::wstring &src)
{
    const wchar_t CR = 13;

    // Count the carriage returns so the result can be sized exactly

    std::wstring::size_type removed = 0;
    for (std::wstring::size_type i=0;i<src.size();i++)
        if (src[i]==CR)
            removed++;

    // Bail out early, if possible (plain copy, harmless when aliased)

    if (removed==0)
    {
        dest = src;
        return;
    }

    // Copy everything except the carriage returns

    std::wstring result;
    result.reserve(src.size()-removed);

    for (std::wstring::size_type j=0;j<src.size();j++)
        if (src[j]!=CR)
            result += src[j];

    assert(result.size()==src.size()-removed);

    dest.swap(result);
}
00103 
// Insert a carriage return (ASCII 13) in front of every line feed
// (ASCII 10) of src that is not already preceded by one, and store the
// result in dest.
//
// An empty src yields an empty dest.  dest and src may be the same
// string: the output is assembled in a temporary and only swapped into
// dest after src has been read in full.
void unix2dos(std::string &dest,const std::string &src)
{
    const char CR = 13;
    const char LF = 10;

    // Count the line feeds that lack a preceding carriage return, so
    // that the result can be sized exactly

    std::string::size_type missing = 0;
    for (std::string::size_type i=0;i<src.size();i++)
        if (src[i]==LF && (i==0 || src[i-1]!=CR))
            missing++;

    // Copy everything, inserting carriage returns where necessary

    std::string result;
    result.reserve(src.size()+missing);

    for (std::string::size_type j=0;j<src.size();j++)
    {
        if (src[j]==LF && (j==0 || src[j-1]!=CR))
            result += CR;

        result += src[j];
    }

    assert(result.size()==src.size()+missing);

    dest.swap(result);
}
00149 
// Insert a carriage return (code 13) in front of every line feed
// (code 10) of the wide string src that is not already preceded by one,
// and store the result in dest.
//
// An empty src yields an empty dest.  dest and src may be the same
// string: the output is assembled in a temporary and only swapped into
// dest after src has been read in full.
void unix2dos(std::wstring &dest,const std::wstring &src)
{
    const wchar_t CR = 13;
    const wchar_t LF = 10;

    // Count the line feeds that lack a preceding carriage return, so
    // that the result can be sized exactly

    std::wstring::size_type missing = 0;
    for (std::wstring::size_type i=0;i<src.size();i++)
        if (src[i]==LF && (i==0 || src[i-1]!=CR))
            missing++;

    // Copy everything, inserting carriage returns where necessary

    std::wstring result;
    result.reserve(src.size()+missing);

    for (std::wstring::size_type j=0;j<src.size();j++)
    {
        if (src[j]==LF && (j==0 || src[j-1]!=CR))
            result += CR;

        result += src[j];
    }

    assert(result.size()==src.size()+missing);

    dest.swap(result);
}
00195 
// Append everything that can still be read from an input stream to dest.
//
// The stream is drained in chunks of 10240 bytes until it is no longer
// good or has reached end-of-file.  dest is not cleared first.
void readStream(std::istream &is,std::string &dest)
{
    const int chunkSize = 10240;
    char chunk[chunkSize];

    for (;;)
    {
        if (!is.good() || is.eof())
            break;

        is.read(chunk,chunkSize);

        // Number of bytes the last read actually delivered
        const int received = is.gcount();
        if (received>0)
            dest.append(chunk,received);
    }
}
00208 
// Write all bytes of src (embedded zero bytes included) to an output
// stream, unformatted.
void writeStream(std::ostream &os,const std::string &src)
{
    const char *bytes = src.c_str();
    const std::string::size_type count = src.length();

    os.write(bytes,count);
}
00213 
//
// Read a stream into a Unicode (wide) string
//
// The stream is consumed as a sequence of raw, wchar_t-sized code
// units.  The first unit must be the BOM (Byte Order Marker): it
// identifies the data as Unicode and, when it arrives byte-swapped,
// switches on swapping of the two low-order bytes of every unit.
// The BOM itself is not stored.
//
// dest is always cleared first.  If the stream does not start with a
// BOM, nothing is read into dest.  Trailing bytes that do not make up
// a whole unit are ignored.
//

const wchar_t BOM  = 0xFEFF;
const wchar_t BOMe = 0xFFFE;    // BOM as seen with opposite endianness

void readUnicodeStream(std::istream &is,std::wstring &dest)
{
    const int  bufferSize=1024;     // Buffer size, in code units
    wchar_t    buffer[bufferSize];  // Buffer
    bool       firstBlock = true;   // Check header of first block
    bool       swapBytes = false;   // Opposite endian origin

    dest = std::wstring();

    // As long as input stream is good

    while (is.good())
    {
        // Read into buffer and find out how many whole code units
        // were read.

        is.read(reinterpret_cast<char *>(buffer),bufferSize*sizeof(wchar_t));
        const std::wstring::size_type count =
            static_cast<std::wstring::size_type>(is.gcount())/sizeof(wchar_t);

        if (count==0)
            continue;

        // Range of the buffer that gets appended to dest

        std::wstring::size_type first = 0;

        // The very first unit of the stream must be the BOM.  If it
        // is in the wrong order, enable swapping.  Only this unit is
        // dropped; a U+FEFF at the start of a later block is data.

        if (firstBlock)
        {
            if (buffer[0]!=BOM && buffer[0]!=BOMe)
                return;

            swapBytes  = (buffer[0]==BOMe);
            firstBlock = false;
            first      = 1;
        }

        // If we're in swap mode, swap high and low bytes of each
        // code.  Done on an unsigned value with explicit masks so the
        // result does not depend on the width or signedness of
        // wchar_t.

        if (swapBytes)
            for (std::wstring::size_type c=first; c<count; c++)
            {
                const unsigned long unit = static_cast<unsigned long>(buffer[c]);
                buffer[c] = static_cast<wchar_t>(((unit&0xFF)<<8) | ((unit>>8)&0xFF));
            }

        dest.append(buffer+first,count-first);
    }
}
00288 
00289 //
00290 // Write a unicode string to a file
00291 //
00292 
00293 void writeUnicodeStream(ostream &os,const wstring &src)
00294 {
00295     os.write(reinterpret_cast<const char *>(&BOM),sizeof(wchar_t));
00296     os.write(reinterpret_cast<const char *>(src.data()),src.size()*sizeof(wchar_t));
00297 }
00298 
// Widen a narrow string into a wide string, one character per byte.
//
// Each byte is interpreted as an unsigned value (0..255), so bytes
// above 127 are not sign-extended.  Previous contents of dest are
// replaced.
void string2wstring(std::wstring &dest,const std::string &src)
{
    dest.resize(src.size());

    // Index with the string's own size type so that the loop cannot
    // wrap around on very large inputs
    for (std::string::size_type i=0; i<src.size(); i++)
        dest[i] = static_cast<unsigned char>(src[i]);
}
00305 
// Narrow a wide string into a byte string, one byte per character.
//
// Characters whose value is below 256 are copied (truncated to char);
// every other character is replaced by a space.  Previous contents of
// dest are replaced.
void wstring2string(std::string &dest,const std::wstring &src)
{
    dest.resize(src.size());

    // Index with the string's own size type so that the loop cannot
    // wrap around on very large inputs
    for (std::wstring::size_type i=0; i<src.size(); i++)
        dest[i] = src[i] < 256 ? static_cast<char>(src[i]) : ' ';
}
00312 
00313 //
00314 
// http://www.cl.cam.ac.uk/~mgk25/unicode.html#utf-8
//
// TODO: Support U-10000 onwards

// True if byte has the form 10xxxxxx, i.e. is a UTF-8 continuation byte
static bool utf8Continuation(unsigned char byte)
{
    return (byte&0xC0) == 0x80;
}

// Decode the UTF-8 text in src and append the resulting characters to
// dest (dest is not cleared).
//
// One, two and three byte sequences (U-0 to U-FFFF) are decoded.  A
// lead byte that is not followed by the required continuation bytes is
// dropped on its own, and decoding resumes with the byte after it, so
// malformed input never swallows the valid characters that follow.
// Stray continuation bytes and lead bytes of longer sequences are
// skipped.
void utf8decode(std::wstring &dest, const std::string &src)
{
    const std::string::size_type size = src.size();
    std::string::size_type i = 0;

    while (i<size)
    {
        const unsigned long c = static_cast<unsigned char>(src[i++]);

        // U-0 to U-7F

        if ((c&0x80) == 0x00)
        {
            dest += static_cast<wchar_t>(c);
            continue;
        }

        // U-80 to U-7FF

        if ((c&0xE0) == 0xC0)
        {
            if (i<size && utf8Continuation(src[i]))
            {
                const unsigned long d = static_cast<unsigned char>(src[i++]);
                dest += static_cast<wchar_t>((c&0x1f)<<6 | (d&0x3f));
            }
            continue;
        }

        // U-800 to U-FFFF

        if ((c&0xF0) == 0xE0)
        {
            if (i+1<size && utf8Continuation(src[i]) && utf8Continuation(src[i+1]))
            {
                const unsigned long d = static_cast<unsigned char>(src[i++]);
                const unsigned long e = static_cast<unsigned char>(src[i++]);
                dest += static_cast<wchar_t>((c&0x0f)<<12 | (d&0x3f)<<6 | (e&0x3f));
            }
            continue;
        }

        // Anything else (stray continuation byte, or the lead byte of
        // a sequence beyond U-FFFF) is skipped
    }
}
00362 
00363 //
00364 
// Emit one row of n bytes as comma separated, zero padded, two digit
// hexadecimal literals ("0x01,0xff,...").
//
// begin tells whether this is the first row: every later non-empty row
// is preceded by a comma and a line break.  begin is always cleared.
// The stream is left in hexadecimal mode with '0' as fill character.
void bin2src_(std::ostream &os,bool &begin,const unsigned char *buffer,const int n)
{
    os << std::hex;

    const bool continuesPreviousRow = (n>0 && !begin);
    if (continuesPreviousRow)
        os << ',' << std::endl;

    begin = false;

    for (int index=0; index<n; ++index)
    {
        if (index>0)
            os << ',';

        os << "0x" << std::setw(2) << std::setfill('0')
           << static_cast<unsigned int>(buffer[index]);
    }
}
00384 
00385 void bin2src(std::ostream &os,const unsigned char *buffer,const int n)
00386 {
00387     os << '{' << endl;
00388     
00389     bool begin = true;
00390 
00391     for (int i=0; i<n; i+=16)
00392         if (n-i>16)
00393             bin2src_(os,begin,buffer+i,16);
00394         else
00395             bin2src_(os,begin,buffer+i,n-i);
00396 
00397     os << endl << "};" << endl;
00398 }
00399 
00400 void bin2src(std::ostream &os, const std::string &src)
00401 {
00402     bin2src(os,(const unsigned char *) src.c_str(),src.length());
00403 }
00404 
00405 void bin2src(std::ostream &os, std::istream &is)
00406 {
00407     os << '{' << endl;
00408     
00409     bool begin = true;
00410 
00411     while (is.good() && !is.eof())
00412     {
00413         unsigned char buffer[16];
00414         is.read((char *) buffer,16);
00415         int size = is.gcount();
00416         bin2src_(os,begin,buffer,size);
00417     }
00418 
00419     os << endl << "};" << endl;
00420 }
00421 
00422 //
00423 
// Emit one ".byte" line holding n bytes as comma separated, zero
// padded, two digit hexadecimal literals.  Nothing at all is written
// (and the stream is not touched) when n is zero or negative.
// Otherwise the stream is left in hexadecimal mode with '0' as fill
// character.
void bin2asm_(std::ostream &os,const unsigned char *buffer,const int n)
{
    if (n<=0)
        return;

    os << std::hex;

    os << "\t.byte ";

    for (int index=0; index<n; ++index)
    {
        if (index>0)
            os << ',';

        os << "0x" << std::setw(2) << std::setfill('0')
           << static_cast<unsigned int>(buffer[index]);
    }

    os << std::endl;
}
00442 
00443 void bin2asm(std::ostream &os,const unsigned char *buffer,const int n)
00444 {
00445     for (int i=0; i<n; i+=16)
00446         if (n-i>16)
00447             bin2asm_(os,buffer+i,16);
00448         else
00449             bin2asm_(os,buffer+i,n-i);
00450 }
00451 
00452 void bin2asm(std::ostream &os, const std::string &src)
00453 {
00454     bin2asm(os,(const unsigned char *) src.c_str(),src.length());
00455 }
00456 
00457 void bin2asm(std::ostream &os, std::istream &is)
00458 {
00459     while (is.good() && !is.eof())
00460     {
00461         unsigned char buffer[16];
00462         is.read((char *) buffer,16);
00463         int size = is.gcount();
00464         bin2asm_(os,buffer,size);
00465     }
00466 }
00467 
// Convert one hexadecimal digit character ('0'-'9', 'a'-'f', 'A'-'F')
// to its value 0..15.  Any other character yields 0.
unsigned int fromHex4(unsigned char ch)
{
    unsigned int value = 0;

    if ('0'<=ch && ch<='9')
        value = ch-'0';
    else if ('a'<=ch && ch<='f')
        value = ch-'a'+10;
    else if ('A'<=ch && ch<='F')
        value = ch-'A'+10;

    return value;
}
00481 
// Convert the low four bits of val to the matching upper case
// hexadecimal digit character.  Higher bits of val are ignored.
unsigned char toHex4(unsigned int val)
{
    static const char digits[] = "0123456789ABCDEF";

    const unsigned int nibble = val & 0x0F;
    return static_cast<unsigned char>(digits[nibble]);
}
00492 
// Split str at every occurrence of delim and store the pieces in vec.
//
// vec is cleared first.  Empty pieces are kept, so a leading, trailing
// or doubled delimiter produces empty strings, and an empty str yields
// a single empty piece.
//
// Returns false (with str as the only piece) if delim is empty, true
// otherwise.
bool stringSplit(std::vector<std::string> &vec,const std::string &str,const std::string &delim)
{
    vec.clear();

    if (delim.empty())
    {
        vec.push_back(str);
        return false;
    }

    std::string::size_type start = 0;
    std::string::size_type hit   = str.find(delim,start);

    while (hit!=std::string::npos)
    {
        vec.push_back(str.substr(start,hit-start));

        // Continue searching right behind the delimiter just found
        start = hit + delim.size();
        hit   = str.find(delim,start);
    }

    // Whatever follows the last delimiter (possibly nothing)
    vec.push_back(str.substr(start));

    return true;
}
00527 
// Join the elements of vec into str, placing delim between
// neighbouring elements.  An empty vec yields an empty str.
//
// The result is assembled in a temporary and only swapped into str at
// the end, so str may safely refer to an element of vec or to delim.
//
// Always returns true.
bool stringMerge(const std::vector<std::string> &vec, std::string &str,const std::string &delim)
{
    std::string merged;

    for (std::vector<std::string>::size_type i=0; i<vec.size(); i++)
    {
        if (i>0)
            merged += delim;

        merged += vec[i];
    }

    str.swap(merged);

    return true;
}
00542 
// std::string convenience overload of the C library atof()
double atof(const std::string &str)
{
    const char *text = str.c_str();
    return atof(text);
}
// std::string convenience overload of the C library atoi()
int atoi(const std::string &str)
{
    const char *text = str.c_str();
    return atoi(text);
}
// std::string convenience overload of the C library atol()
long atol(const std::string &str)
{
    const char *text = str.c_str();
    return atol(text);
}
// Interpret a string as a boolean: true if it parses (via atoi) to a
// non-zero integer, or if it starts with the four characters "true".
bool atob(const std::string &str)
{
    if (atoi(str.c_str())!=0)
        return true;

    return str.compare(0,4,"true")==0;
}