Clean filenames

As soon as you have people uploading files with all kinds of funky characters and umlauts in their names, you might run into trouble once you have to download them again.

The same applies if you create files on the fly whose names contain, for example, the name of a person or a company.

This was exactly the situation we had, so I came up with a little cleaning function in C#.

Feel free to adjust the mapping list according to your requirements. I mapped some of the more common special chars to the minus sign (-) and tried to find a sensible mapping for all the umlauts and other special characters.

The method CleanFileName() first maps all the known chars and then gets rid of everything that is left.

 
//http://www.pjb.com.au/comp/diacritics.html
// Lookup table used to transliterate special characters into plain ASCII.
// Each row is { character to find, replacement text }. Every replacement
// consists only of ASCII letters, digits and the minus sign, and none of
// those characters is itself a key, so the order of the rows does not matter.
private static readonly string[,] CharacterReplacements = { 
    // common punctuation -> minus sign
    { " ", "-"},
    { "&", "-"},
    { "?", "-"},
    { "!", "-"},
    { "%", "-"},
    { "+", "-"},
    { "#", "-"},
    { ":", "-"},
    { ";", "-"},
    { ".", "-"},
 
    // currency signs, fractions and other symbols
    { "¢", "c" },   //cent
    { "£", "P" },   //Pound
    { "€", "E" },   //Euro
    { "¥", "Y" },   //Yen
    { "°", "d" },   //degree
    { "¼", "1-4" }, //fraction one-quarter
    { "½", "1-2" }, //fraction half
    { "¾", "3-4" }, //fraction three-quarters
    { "@", "AT" },  //at
    { "Œ", "OE" },  //OE ligature, French (in ISO-8859-15)
    { "œ", "oe" },  //OE ligature, French (in ISO-8859-15)
 
    // uppercase letters; the umlauts use the same two-letter expansion as
    // their lowercase counterparts below (ä -> ae, ö -> oe, ü -> ue)
    {"À","A" },  //grave accent
    {"Á","A" },  //acute accent
    {"Â","A" },  //circumflex accent
    {"Ã","A" },  //tilde
    {"Ä","Ae"},  //umlaut mark
    {"Å","A" },  //ring
    {"Æ","AE"},  //diphthong
    {"Ç","C" },  //cedilla
    {"È","E" },  //grave accent
    {"É","E" },  //acute accent
    {"Ê","E" },  //circumflex accent
    {"Ë","E" },  //umlaut mark
    {"Ì","I" },  //grave accent
    {"Í","I" },  //acute accent
    {"Î","I" },  //circumflex accent
    {"Ï","I" },  //umlaut mark
    {"Ð","Eth"}, //Icelandic
    {"Ñ","N" },  //tilde
    {"Ò","O" },  //grave accent
    {"Ó","O" },  //acute accent
    {"Ô","O" },  //circumflex accent
    {"Õ","O" },  //tilde
    {"Ö","Oe"},  //umlaut mark
    {"Ø","O" },  //slash
    {"Ù","U" },  //grave accent
    {"Ú","U" },  //acute accent
    {"Û","U" },  //circumflex accent
    {"Ü","Ue"},  //umlaut mark
    {"Ý","Y" },  //acute accent
    {"Þ","Th"},  //Icelandic thorn - http://en.wikipedia.org/wiki/Thorn_(letter)
    {"ß","ss"},  //German
 
    // lowercase letters
    {"à","a" },  //grave accent
    {"á","a" },  //acute accent
    {"â","a" },  //circumflex accent
    {"ã","a" },  //tilde
    {"ä","ae"},  //umlaut mark
    {"å","a" },  //ring
    {"æ","ae"},  //diphthong
    {"ç","c" },  //cedilla
    {"è","e" },  //grave accent
    {"é","e" },  //acute accent
    {"ê","e" },  //circumflex accent
    {"ë","e" },  //umlaut mark
    {"ì","i" },  //grave accent
    {"í","i" },  //acute accent
    {"î","i" },  //circumflex accent
    {"ï","i" },  //umlaut mark
    {"ð","eth"}, //Icelandic
    {"ñ","n" },  //tilde
    {"ò","o" },  //grave accent
    {"ó","o" },  //acute accent
    {"ô","o" },  //circumflex accent
    {"õ","o" },  //tilde
    {"ö","oe"},  //umlaut mark
    {"ø","o" },  //slash
    {"ù","u" },  //grave accent
    {"ú","u" },  //acute accent
    {"û","u" },  //circumflex accent
    {"ü","ue"},  //umlaut mark
    {"ý","y" },  //acute accent
    {"þ","th"},  //Icelandic thorn - http://en.wikipedia.org/wiki/Thorn_(letter)
    {"ÿ","y" },  //umlaut mark
    };
 
 
 
//http://stackoverflow.com/questions/3885964/regex-to-replace-invalid-characters
/// <summary>
/// Convenience overload: strips the disallowed characters from
/// <paramref name="source"/> without putting anything in their place.
/// </summary>
/// <param name="source">The text to clean.</param>
/// <returns>The result of the two-argument overload called with an empty replacement.</returns>
public static string RemoveNonWordChars(string source)
{
    // An empty replacement simply deletes whatever the two-argument overload matches.
    string cleaned = RemoveNonWordChars(source, "");
    return cleaned;
}
 
 
//http://stackoverflow.com/questions/3885964/regex-to-replace-invalid-characters
// Matches every run of one or more characters that are NOT an ASCII letter,
// an ASCII digit or the minus sign. This is deliberately not \W: the
// underscore is matched (and therefore replaced) while the minus sign is kept.
// Built once and reused instead of constructing a new Regex on every call.
private static readonly Regex DisallowedCharacterRuns = new Regex(@"[^a-zA-Z0-9-]+");

/// <summary>
/// Replaces every run of disallowed characters in <paramref name="source"/>
/// with <paramref name="replacement"/>. Only a-z, A-Z, 0-9 and '-' are kept.
/// </summary>
/// <param name="source">The text to clean.</param>
/// <param name="replacement">
/// Text inserted once per run of disallowed characters (not once per character).
/// It is handed to <see cref="Regex.Replace(string, string)"/> and is therefore
/// interpreted as a regex replacement pattern.
/// </param>
/// <returns>The cleaned text.</returns>
public static string RemoveNonWordChars(string source, string replacement)
{
    return DisallowedCharacterRuns.Replace(source, replacement);
}
 
 
/// <summary>
/// Produces a file name that only contains a-z, A-Z, 0-9 and '-', plus a
/// single '.' in front of the file ending when there is one.
/// </summary>
/// <remarks>
/// The name is split at its LAST dot. The part before and the part after
/// are cleaned independently: known special characters are first mapped via
/// the CharacterReplacements table, then everything that is still not
/// allowed is removed. If the ending is empty after cleaning (for example
/// "report." or an ending made up only of unmapped characters), no dot is
/// appended, so the result never ends in a dangling '.'.
/// </remarks>
/// <param name="filename">The raw file name, without any directory part. Must not be null.</param>
/// <returns>The cleaned file name.</returns>
public static string CleanFileName(string filename)
{
    // The char overload is a plain ordinal search; the string overload
    // would perform a culture-sensitive comparison.
    int index = filename.LastIndexOf('.');

    // No dot at all: the whole input is the name, there is no ending.
    if (index == -1)
    {
        return CleanFileNamePart(filename);
    }

    string fileEnding = CleanFileNamePart(filename.Substring(index + 1));
    string baseName = CleanFileNamePart(filename.Substring(0, index));

    // Only re-attach the separator when there is an ending to attach.
    if (fileEnding.Length == 0)
    {
        return baseName;
    }

    return baseName + "." + fileEnding;
}

// Cleans one part of a file name (either the base name or the ending).
private static string CleanFileNamePart(string part)
{
    //replace based on the CharacterReplacements list
    for (int i = 0; i < CharacterReplacements.GetLength(0); i++)
    {
        part = part.Replace(CharacterReplacements[i, 0], CharacterReplacements[i, 1]);
    }

    //remove everything that is left
    return StringUtil.RemoveNonWordChars(part);
}

You could use HttpUtility.UrlEncode, but that does not solve all the issues that are solved with my solution. And it should work in all browsers. My tests with UrlEncode worked fine in Chrome, but not in IE.




Ähnliche Beiträge


Leave a Reply

Your email address will not be published. Required fields are marked *



*