|
||||
File indexing completed on 2025-01-29 10:18:02
0001 // © 2016 and later: Unicode, Inc. and others. 0002 // License & terms of use: http://www.unicode.org/copyright.html 0003 /* 0004 ********************************************************************** 0005 * Copyright (C) 2005-2013, International Business Machines 0006 * Corporation and others. All Rights Reserved. 0007 ********************************************************************** 0008 * file name: ucsdet.h 0009 * encoding: UTF-8 0010 * indentation:4 0011 * 0012 * created on: 2005Aug04 0013 * created by: Andy Heninger 0014 * 0015 * ICU Character Set Detection, API for C 0016 * 0017 * Draft version 18 Oct 2005 0018 * 0019 */ 0020 0021 #ifndef __UCSDET_H 0022 #define __UCSDET_H 0023 0024 #include "unicode/utypes.h" 0025 0026 #if !UCONFIG_NO_CONVERSION 0027 0028 #include "unicode/uenum.h" 0029 0030 #if U_SHOW_CPLUSPLUS_API 0031 #include "unicode/localpointer.h" 0032 #endif // U_SHOW_CPLUSPLUS_API 0033 0034 /** 0035 * \file 0036 * \brief C API: Charset Detection API 0037 * 0038 * This API provides a facility for detecting the 0039 * charset or encoding of character data in an unknown text format. 0040 * The input data can be from an array of bytes. 0041 * <p> 0042 * Character set detection is at best an imprecise operation. The detection 0043 * process will attempt to identify the charset that best matches the characteristics 0044 * of the byte data, but the process is partly statistical in nature, and 0045 * the results can not be guaranteed to always be correct. 0046 * <p> 0047 * For best accuracy in charset detection, the input data should be primarily 0048 * in a single language, and a minimum of a few hundred bytes worth of plain text 0049 * in the language are needed. The detection process will attempt to 0050 * ignore html or xml style markup that could otherwise obscure the content. 0051 * <p> 0052 * An alternative to the ICU Charset Detector is the 0053 * Compact Encoding Detector, https://github.com/google/compact_enc_det. 
0054 * It often gives more accurate results, especially with short input samples. 0055 */ 0056 0057 0058 struct UCharsetDetector; 0059 /** 0060 * Structure representing a charset detector 0061 * @stable ICU 3.6 0062 */ 0063 typedef struct UCharsetDetector UCharsetDetector; 0064 0065 struct UCharsetMatch; 0066 /** 0067 * Opaque structure representing a match that was identified 0068 * from a charset detection operation. 0069 * @stable ICU 3.6 0070 */ 0071 typedef struct UCharsetMatch UCharsetMatch; 0072 0073 /** 0074 * Open a charset detector. 0075 * 0076 * @param status Any error conditions occurring during the open 0077 * operation are reported back in this variable. 0078 * @return the newly opened charset detector. 0079 * @stable ICU 3.6 0080 */ 0081 U_CAPI UCharsetDetector * U_EXPORT2 0082 ucsdet_open(UErrorCode *status); 0083 0084 /** 0085 * Close a charset detector. All storage and any other resources 0086 * owned by this charset detector will be released. Failure to 0087 * close a charset detector when finished with it can result in 0088 * memory leaks in the application. 0089 * 0090 * @param ucsd The charset detector to be closed. 0091 * @stable ICU 3.6 0092 */ 0093 U_CAPI void U_EXPORT2 0094 ucsdet_close(UCharsetDetector *ucsd); 0095 0096 #if U_SHOW_CPLUSPLUS_API 0097 0098 U_NAMESPACE_BEGIN 0099 0100 /** 0101 * \class LocalUCharsetDetectorPointer 0102 * "Smart pointer" class, closes a UCharsetDetector via ucsdet_close(). 0103 * For most methods see the LocalPointerBase base class. 0104 * 0105 * @see LocalPointerBase 0106 * @see LocalPointer 0107 * @stable ICU 4.4 0108 */ 0109 U_DEFINE_LOCAL_OPEN_POINTER(LocalUCharsetDetectorPointer, UCharsetDetector, ucsdet_close); 0110 0111 U_NAMESPACE_END 0112 0113 #endif 0114 0115 /** 0116 * Set the input byte data whose charset is to be detected. 0117 * 0118 * Ownership of the input text byte array remains with the caller. 
0119 * The input string must not be altered or deleted until the charset 0120 * detector is either closed or reset to refer to different input text. 0121 * 0122 * @param ucsd the charset detector to be used. 0123 * @param textIn the input text of unknown encoding. 0124 * @param len the length of the input text, or -1 if the text 0125 * is NUL terminated. 0126 * @param status any error conditions are reported back in this variable. 0127 * 0128 * @stable ICU 3.6 0129 */ 0130 U_CAPI void U_EXPORT2 0131 ucsdet_setText(UCharsetDetector *ucsd, const char *textIn, int32_t len, UErrorCode *status); 0132 0133 0134 /** Set the declared encoding for charset detection. 0135 * The declared encoding of an input text is an encoding obtained 0136 * by the user from an http header or xml declaration or similar source that 0137 * can be provided as an additional hint to the charset detector. 0138 * 0139 * How and whether the declared encoding will be used during the 0140 * detection process is TBD. 0141 * 0142 * @param ucsd the charset detector to be used. 0143 * @param encoding an encoding for the current data obtained from 0144 * a header or declaration or other source outside 0145 * of the byte data itself. 0146 * @param length the length of the encoding name, or -1 if the name string 0147 * is NUL terminated. 0148 * @param status any error conditions are reported back in this variable. 0149 * 0150 * @stable ICU 3.6 0151 */ 0152 U_CAPI void U_EXPORT2 0153 ucsdet_setDeclaredEncoding(UCharsetDetector *ucsd, const char *encoding, int32_t length, UErrorCode *status); 0154 0155 0156 /** 0157 * Return the charset that best matches the supplied input data. 0158 * 0159 * Note though, that because the detection 0160 * only looks at the start of the input data, 0161 * there is a possibility that the returned charset will fail to handle 0162 * the full set of input data. 0163 * <p> 0164 * The returned UCharsetMatch object is owned by the UCharsetDetector. 
0165 * It will remain valid until the detector input is reset, or until 0166 * the detector is closed. 0167 * <p> 0168 * The function will fail if 0169 * <ul> 0170 * <li>no charset appears to match the data.</li> 0171 * <li>no input text has been provided</li> 0172 * </ul> 0173 * 0174 * @param ucsd the charset detector to be used. 0175 * @param status any error conditions are reported back in this variable. 0176 * @return a UCharsetMatch representing the best matching charset, 0177 * or NULL if no charset matches the byte data. 0178 * 0179 * @stable ICU 3.6 0180 */ 0181 U_CAPI const UCharsetMatch * U_EXPORT2 0182 ucsdet_detect(UCharsetDetector *ucsd, UErrorCode *status); 0183 0184 0185 /** 0186 * Find all charset matches that appear to be consistent with the input, 0187 * returning an array of results. The results are ordered with the 0188 * best quality match first. 0189 * 0190 * Because the detection only looks at a limited amount of the 0191 * input byte data, some of the returned charsets may fail to handle 0192 * all of the input data. 0193 * <p> 0194 * The returned UCharsetMatch objects are owned by the UCharsetDetector. 0195 * They will remain valid until the detector is closed or modified. 0196 * 0197 * <p> 0198 * Return an error if 0199 * <ul> 0200 * <li>no charsets appear to match the input data.</li> 0201 * <li>no input text has been provided</li> 0202 * </ul> 0203 * 0204 * @param ucsd the charset detector to be used. 0205 * @param matchesFound pointer to a variable that will be set to the 0206 * number of charsets identified that are consistent with 0207 * the input data. Output only. 0208 * @param status any error conditions are reported back in this variable. 0209 * @return A pointer to an array of pointers to UCharsetMatch objects. 0210 * This array, and the UCharsetMatch instances to which it refers, 0211 * are owned by the UCharsetDetector, and will remain valid until 0212 * the detector is closed or modified. 
0213 * @stable ICU 3.6 0214 */ 0215 U_CAPI const UCharsetMatch ** U_EXPORT2 0216 ucsdet_detectAll(UCharsetDetector *ucsd, int32_t *matchesFound, UErrorCode *status); 0217 0218 0219 0220 /** 0221 * Get the name of the charset represented by a UCharsetMatch. 0222 * 0223 * The storage for the returned name string is owned by the 0224 * UCharsetMatch, and will remain valid while the UCharsetMatch 0225 * is valid. 0226 * 0227 * The name returned is suitable for use with the ICU conversion APIs. 0228 * 0229 * @param ucsm The charset match object. 0230 * @param status Any error conditions are reported back in this variable. 0231 * @return The name of the matching charset. 0232 * 0233 * @stable ICU 3.6 0234 */ 0235 U_CAPI const char * U_EXPORT2 0236 ucsdet_getName(const UCharsetMatch *ucsm, UErrorCode *status); 0237 0238 /** 0239 * Get a confidence number for the quality of the match of the byte 0240 * data with the charset. Confidence numbers range from zero to 100, 0241 * with 100 representing complete confidence and zero representing 0242 * no confidence. 0243 * 0244 * The confidence values are somewhat arbitrary. They define 0245 * an ordering within the results for any single detection operation 0246 * but are not generally comparable between the results for different input. 0247 * 0248 * A confidence value of ten does have a general meaning - it is used 0249 * for charsets that can represent the input data, but for which there 0250 * is no other indication that suggests that the charset is the correct one. 0251 * Pure 7 bit ASCII data, for example, is compatible with a 0252 * great many charsets, most of which will appear as possible matches 0253 * with a confidence of 10. 0254 * 0255 * @param ucsm The charset match object. 0256 * @param status Any error conditions are reported back in this variable. 0257 * @return A confidence number for the charset match. 
0258 * 0259 * @stable ICU 3.6 0260 */ 0261 U_CAPI int32_t U_EXPORT2 0262 ucsdet_getConfidence(const UCharsetMatch *ucsm, UErrorCode *status); 0263 0264 /** 0265 * Get the RFC 3066 code for the language of the input data. 0266 * 0267 * The Charset Detection service is intended primarily for detecting 0268 * charsets, not language. For some, but not all, charsets, a language is 0269 * identified as a byproduct of the detection process, and that is what 0270 * is returned by this function. 0271 * 0272 * CAUTION: 0273 * 1. Language information is not available for input data encoded in 0274 * all charsets. In particular, no language is identified 0275 * for UTF-8 input data. 0276 * 0277 * 2. Closely related languages may sometimes be confused. 0278 * 0279 * If more accurate language detection is required, a linguistic 0280 * analysis package should be used. 0281 * 0282 * The storage for the returned name string is owned by the 0283 * UCharsetMatch, and will remain valid while the UCharsetMatch 0284 * is valid. 0285 * 0286 * @param ucsm The charset match object. 0287 * @param status Any error conditions are reported back in this variable. 0288 * @return The RFC 3066 code for the language of the input data, or 0289 * an empty string if the language could not be determined. 0290 * 0291 * @stable ICU 3.6 0292 */ 0293 U_CAPI const char * U_EXPORT2 0294 ucsdet_getLanguage(const UCharsetMatch *ucsm, UErrorCode *status); 0295 0296 0297 /** 0298 * Get the entire input text as a UChar string, placing it into 0299 * a caller-supplied buffer. A terminating 0300 * NUL character will be appended to the buffer if space is available. 0301 * 0302 * The number of UChars in the output string, not including the terminating 0303 * NUL, is returned. 0304 * 0305 * If the supplied buffer is smaller than required to hold the output, 0306 * the contents of the buffer are undefined. 
The full output string length 0307 * (in UChars) is returned as always, and can be used to allocate a buffer 0308 * of the correct size. 0309 * 0310 * 0311 * @param ucsm The charset match object. 0312 * @param buf A UChar buffer to be filled with the converted text data. 0313 * @param cap The capacity of the buffer in UChars. 0314 * @param status Any error conditions are reported back in this variable. 0315 * @return The number of UChars in the output string. 0316 * 0317 * @stable ICU 3.6 0318 */ 0319 U_CAPI int32_t U_EXPORT2 0320 ucsdet_getUChars(const UCharsetMatch *ucsm, 0321 UChar *buf, int32_t cap, UErrorCode *status); 0322 0323 0324 0325 /** 0326 * Get an iterator over the set of all detectable charsets - 0327 * over the charsets that are known to the charset detection 0328 * service. 0329 * 0330 * The returned UEnumeration provides access to the names of 0331 * the charsets. 0332 * 0333 * <p> 0334 * The state of the Charset detector that is passed in does not 0335 * affect the result of this function, but requiring a valid, open 0336 * charset detector as a parameter ensures that the charset detection 0337 * service has been safely initialized and that the required detection 0338 * data is available. 0339 * 0340 * <p> 0341 * <b>Note:</b> Multiple different charset encodings in the same family may use 0342 * a single shared name in this implementation. For example, this method returns 0343 * an array including "ISO-8859-1" (ISO Latin 1), but not including "windows-1252" 0344 * (Windows Latin 1). However, the actual detection result could be "windows-1252" 0345 * when the input data matches Latin 1 code points with any points only available 0346 * in "windows-1252". 0347 * 0348 * @param ucsd a Charset detector. 0349 * @param status Any error conditions are reported back in this variable. 0350 * @return an iterator providing access to the detectable charset names. 
0351 * @stable ICU 3.6 0352 */ 0353 U_CAPI UEnumeration * U_EXPORT2 0354 ucsdet_getAllDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 0355 0356 /** 0357 * Test whether input filtering is enabled for this charset detector. 0358 * Input filtering removes text that appears to be HTML or xml 0359 * markup from the input before applying the code page detection 0360 * heuristics. 0361 * 0362 * @param ucsd The charset detector to check. 0363 * @return true if filtering is enabled. 0364 * @stable ICU 3.6 0365 */ 0366 0367 U_CAPI UBool U_EXPORT2 0368 ucsdet_isInputFilterEnabled(const UCharsetDetector *ucsd); 0369 0370 0371 /** 0372 * Enable filtering of input text. If filtering is enabled, 0373 * text within angle brackets ("<" and ">") will be removed 0374 * before detection, which will remove most HTML or xml markup. 0375 * 0376 * @param ucsd the charset detector to be modified. 0377 * @param filter <code>true</code> to enable input text filtering. 0378 * @return The previous setting. 0379 * 0380 * @stable ICU 3.6 0381 */ 0382 U_CAPI UBool U_EXPORT2 0383 ucsdet_enableInputFilter(UCharsetDetector *ucsd, UBool filter); 0384 0385 #ifndef U_HIDE_INTERNAL_API 0386 /** 0387 * Get an iterator over the set of detectable charsets - 0388 * over the charsets that are enabled by the specified charset detector. 0389 * 0390 * The returned UEnumeration provides access to the names of 0391 * the charsets. 0392 * 0393 * @param ucsd a Charset detector. 0394 * @param status Any error conditions are reported back in this variable. 0395 * @return an iterator providing access to the detectable charset names by 0396 * the specified charset detector. 0397 * @internal 0398 */ 0399 U_CAPI UEnumeration * U_EXPORT2 0400 ucsdet_getDetectableCharsets(const UCharsetDetector *ucsd, UErrorCode *status); 0401 0402 /** 0403 * Enable or disable an individual charset encoding. 
0404 * The name of the charset encoding must be one of the names returned by 0405 * {@link #ucsdet_getAllDetectableCharsets()}. 0406 * 0407 * @param ucsd a Charset detector. 0408 * @param encoding the name of the charset encoding. 0409 * @param enabled <code>true</code> to enable, or <code>false</code> to disable the 0410 * charset encoding. 0411 * @param status receives the return status. When the name of charset encoding 0412 * is not supported, U_ILLEGAL_ARGUMENT_ERROR is set. 0413 * @internal 0414 */ 0415 U_CAPI void U_EXPORT2 0416 ucsdet_setDetectableCharset(UCharsetDetector *ucsd, const char *encoding, UBool enabled, UErrorCode *status); 0417 #endif /* U_HIDE_INTERNAL_API */ 0418 0419 #endif 0420 #endif /* __UCSDET_H */ 0421 0422
[ Source navigation ] | [ Diff markup ] | [ Identifier search ] | [ general search ] |
This page was automatically generated by the 2.3.7 LXR engine. The LXR team |