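-- NOTE: the next six lines are page residue from the site this file was
-- copied from; they are not part of the module.
-- JavedA's picture
-- unable to cross ref for quarto bug V_1
-- 2e2cacf
-- raw
-- history blame
-- 20.3 kB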
-- puremagic 1.0.1
-- Copyright (c) 2014 Will Bond <[email protected]>
-- Licensed under the MIT license.
-- Returns the last component of a path: the text that follows the final
-- "/" or "\". A path that has no separator, or that ends with one, is
-- returned unchanged. An explicit nil is always returned as second value.
function basename(path)
    local last_component = path:match('[/\\]([^/\\]+)$')
    if last_component == nil then
        return path, nil
    end
    return last_component, nil
end
-- Returns the lower-cased extension of a path without the leading dot, or
-- nil when the path contains no dot.
--
-- Compound extensions are recognised before the generic rule is applied:
-- "tar.<something>" and the iWork names "numbers.zip", "pages.zip" and
-- "key.zip".
function extension(path)
    local lowered = path:lower()

    local tarball = lowered:match('%.(tar%.[^.]+)$')
    if tarball then
        return tarball
    end

    -- Checked in this order; each suffix includes its leading dot
    local iwork_suffixes = {'.numbers.zip', '.pages.zip', '.key.zip'}
    for _, suffix in ipairs(iwork_suffixes) do
        if lowered:sub(-#suffix) == suffix then
            -- Drop the leading dot
            return suffix:sub(2)
        end
    end

    return lowered:match('%.([^.]+)$')
end
-- Reports whether `value` is equal (==) to one of the entries stored at
-- indexes 1..#list of `list`. Returns true or false.
function in_table(value, list)
    local count = #list
    local index = 1
    while index <= count do
        if value == list[index] then
            return true
        end
        index = index + 1
    end
    return false
end
-- Converts a byte string into a list of bit tables.
--
-- The result has one entry per byte of `chars`; each entry is a table of
-- eight numbers (0 or 1) ordered from the most significant bit (index 1) to
-- the least significant bit (index 8).
--
-- The bits are produced with `%` and math.floor so they stay integer-valued
-- on every Lua version. Dividing with `/` yields floats on Lua 5.3+, and
-- table.concat would then render them as "0.0"/"1.0", which breaks callers
-- that feed the concatenated bits to tonumber(..., 2).
function string_to_bit_table(chars)
    local output = {}
    for index = 1, #chars do
        local num = chars:byte(index)
        local bits = {0, 0, 0, 0, 0, 0, 0, 0}
        -- Fill from the least significant bit backwards until nothing is left
        for bit = 8, 1, -1 do
            if num == 0 then
                break
            end
            bits[bit] = num % 2
            num = math.floor(num / 2)
        end
        output[index] = bits
    end
    return output
end
-- Converts a list of bit tables back into a byte string.
--
-- `bits` holds one table per output byte; each of those tables lists the
-- bits (0 or 1) of the byte from most significant to least significant.
-- Returns the bytes joined into a single string.
--
-- The byte value is accumulated arithmetically instead of by concatenating
-- the bits into text, so it does not depend on how numbers are formatted
-- (float-valued bits such as 1.0 are accepted). string.char is used rather
-- than string.format('%c', ...) so that a zero byte is always emitted;
-- Lua 5.1's string.format is understood to drop NUL bytes for '%c'.
function bit_table_to_string(bits)
    local output = {}
    for i = 1, #bits do
        local num = 0
        for j = 1, #bits[i] do
            num = num * 2 + bits[i][j]
        end
        output[i] = string.char(num)
    end
    return table.concat(output)
end
-- Computes the bitwise AND of two byte strings, byte by byte, and returns
-- the result as a byte string with one byte per byte of `a`.
-- `b` is expected to be at least as long as `a`.
function bitwise_and(a, b)
    local left = string_to_bit_table(a)
    local right = string_to_bit_table(b)

    local anded = {}
    for byte_index, left_bits in ipairs(left) do
        local right_bits = right[byte_index]
        local bits = {}
        for j = 1, 8 do
            -- A result bit is set only when it is set on both sides
            bits[j] = (left_bits[j] == 1 and right_bits[j] == 1) and 1 or 0
        end
        anded[byte_index] = bits
    end

    return bit_table_to_string(anded)
end
-- Unpack a little endian byte string into an integer.
--
-- The first byte of `chars` is the least significant one. Returns nil for
-- an empty string (there is nothing to decode).
--
-- The value is built directly from the byte values rather than from a
-- string of "0"/"1" characters, so the result does not depend on how the
-- running Lua version formats numbers as text.
function unpack_le(chars)
    local count = #chars
    if count == 0 then
        return nil
    end
    local value = 0
    -- Walk from the most significant (last) byte down to the first
    for i = count, 1, -1 do
        value = value * 256 + chars:byte(i)
    end
    return value
end
-- Unpack a big endian byte string into an integer.
--
-- The first byte of `chars` is the most significant one. Returns nil for
-- an empty string (there is nothing to decode).
--
-- The value is built directly from the byte values rather than from a
-- string of "0"/"1" characters, so the result does not depend on how the
-- running Lua version formats numbers as text.
function unpack_be(chars)
    local count = #chars
    if count == 0 then
        return nil
    end
    local value = 0
    for i = 1, count do
        value = value * 256 + chars:byte(i)
    end
    return value
end
-- Takes the first 4-8k of an EBML file and identifies if it is matroska or webm
-- and if it contains just video or just audio.
--
-- Returns the mimetype string on success, or nil plus an error message when
-- the content does not start with an EBML header or declares an unknown
-- doctype.
--
-- The content handed to this function is normally only the beginning of a
-- file, so elements are frequently cut off. The section parser raises an
-- error when it runs out of bytes; those errors are caught here. A failure
-- while reading the header is reported as a missing EBML ID; a failure later
-- on simply stops the scan and the decision is made with what was learned up
-- to that point (which means "video" when no track list was reached).
function ebml_parse(content)
    local length = #content

    local header_ok, header_token, header_value, header_bytes = pcall(ebml_parse_section, content)
    if not header_ok or header_token ~= '\x1AE\xDF\xA3' then
        return nil, 'Unable to find EBML ID'
    end
    local position = 1 + header_bytes

    -- The matroska spec sets the default doctype to be 'matroska', however
    -- many files specify this anyway. The other option is 'webm'.
    local doctype = header_value['B\x82'] or 'matroska'
    if doctype ~= 'matroska' and doctype ~= 'webm' then
        return nil, 'Unknown EBML doctype'
    end

    local segment_position = nil
    local track_position = nil
    local has_video = false
    local found_tracks = false

    while position <= length do
        local parsed, ebml_id, ebml_value, section_bytes = pcall(ebml_parse_section, content:sub(position, length))
        if not parsed then
            -- Truncated or malformed element: nothing more can be read
            break
        end
        position = position + section_bytes

        if ebml_id == '\x18S\x80g' then
            -- Segment: remember where its data starts, seek offsets are
            -- relative to this position
            segment_position = position

        elseif ebml_id == '\x11M\x9Bt' then
            -- Meta seek information: look for the seek info about the
            -- tracks token
            for _, seek in ipairs(ebml_value['M\xBB'] or {}) do
                if seek['S\xAB'] == '\x16T\xAEk' then
                    local raw_offset = seek['S\xAC']
                    local offset = nil
                    if type(raw_offset) == 'string' then
                        offset = unpack_be(raw_offset)
                    end
                    if segment_position and offset then
                        track_position = segment_position + offset
                        -- Only ever jump forward. A target behind the
                        -- current position was already scanned, and going
                        -- back to it could loop forever on hostile input.
                        if track_position > position then
                            position = track_position
                        end
                        break
                    end
                end
            end

        elseif ebml_id == '\x16T\xAEk' then
            -- Tracks: scan through each track looking for video
            found_tracks = true
            for _, track in ipairs(ebml_value['\xAE'] or {}) do
                local track_type = track['\x83']
                -- A track type of 1 means video
                if type(track_type) == 'string' and unpack_be(track_type) == 1 then
                    has_video = true
                    break
                end
            end
            break
        end
    end

    if found_tracks and not has_video then
        if doctype == 'matroska' then
            return 'audio/x-matroska'
        else
            return 'audio/webm'
        end
    end

    if doctype == 'matroska' then
        return 'video/x-matroska'
    else
        return 'video/webm'
    end
end
-- Parses one EBML element from the start of `content`.
--
-- Returns three values: the element's EBML ID (in its encoded form), the
-- element's value, and the number of bytes of `content` taken up by the ID,
-- the length field and the value together.
--
-- The value is the raw byte string of the element, except that:
--   * for the Segment element no value is read at all (nil is returned and
--     only the ID and length bytes are counted), and
--   * elements with a 4 byte ID, Seek elements and TrackEntry elements are
--     parsed recursively into a table keyed by child EBML ID. A child ID
--     that occurs more than once maps to an array of its values; Seek and
--     TrackEntry children are always stored as arrays.
function ebml_parse_section(content)
    local ebml_id, element_length, used_bytes = ebml_id_and_length(content)

    -- Don't parse the segment since it is the whole file!
    if ebml_id == '\x18\x53\x80\x67' then
        return ebml_id, nil, used_bytes
    end

    local value_start = used_bytes + 1
    local value_end = used_bytes + element_length
    local ebml_value = content:sub(value_start, value_end)
    used_bytes = value_end

    -- Level 0/1 elements (4 byte IDs) are always parsed, plus Seek (for the
    -- seek information) and TrackEntry (to grab the TrackType)
    local is_container = #ebml_id == 4 or ebml_id == '\x4D\xBB' or ebml_id == '\xAE'
    if not is_container then
        return ebml_id, ebml_value, used_bytes
    end

    local children = {}
    -- Child IDs whose entry in `children` has been converted to an array
    local stored_as_array = {}
    local remaining = ebml_value

    while #remaining > 0 do
        local child_id, child_value, child_bytes = ebml_parse_section(remaining)
        local existing = children[child_id]

        if stored_as_array[child_id] then
            table.insert(existing, child_value)
        elseif existing == nil then
            -- First occurrence: stored by itself, except that seek info and
            -- tracks are forced to be arrays even if there is only one
            if child_id == 'M\xBB' or child_id == '\xAE' then
                child_value = {child_value}
                stored_as_array[child_id] = true
            end
            children[child_id] = child_value
        else
            -- Second occurrence of a plain value: turn it into an array
            children[child_id] = {existing, child_value}
            stored_as_array[child_id] = true
        end

        -- Move past the part that was just parsed
        remaining = remaining:sub(child_bytes + 1, #remaining)
    end

    return ebml_id, children, used_bytes
end
-- Reads an element header from the start of `chars` (12 or more bytes are
-- expected). Returns the EBML ID, the data length of the element, and the
-- number of bytes that the ID and the length field occupy together.
function ebml_id_and_length(chars)
    -- An ID uses the same variable-length encoding as a length, so the
    -- length decoder tells us how many bytes it spans. The decoded number is
    -- ignored: the ID is kept in its encoded form because that is how IDs
    -- are written in the documentation.
    local id_length = select(2, ebml_length(chars:sub(1, 4)))
    local ebml_id = chars:sub(1, id_length)

    -- The length field follows the ID and is at most 8 bytes long
    local length_field = chars:sub(id_length + 1, id_length + 8)
    local element_length, length_bytes = ebml_length(length_field)

    return ebml_id, element_length, id_length + length_bytes
end
-- Should accept 8+ bytes, will return the data length plus the number of bytes
-- that were used to hold the data length.
--
-- The number of leading zero bits in the first byte, plus one, is the number
-- of bytes the length occupies. The first set bit is only a marker and is
-- cleared; the remaining bits of those bytes form a big endian integer.
--
-- Raises an error when `chars` is empty, when the first byte has no marker
-- bit, or when `chars` ends before the announced number of bytes. The value
-- is computed from the byte values directly, so it does not depend on how
-- the running Lua version formats numbers as text.
function ebml_length(chars)
    local first = chars:byte(1)
    if first == nil then
        error('EBML data ended before a length could be read')
    end

    -- Find the marker bit, scanning from the most significant bit
    local value_length = 1
    local marker = 128
    while value_length <= 8 and first < marker do
        value_length = value_length + 1
        marker = math.floor(marker / 2)
    end

    if value_length > 8 then
        error('Invalid EBML length: the first byte has no marker bit')
    end
    if #chars < value_length then
        error('EBML data ended in the middle of a length')
    end

    -- Clear the marker bit so only the value bits of the first byte remain
    local value = first - marker
    for i = 2, value_length do
        value = value * 256 + chars:byte(i)
    end

    return value, value_length
end
-- Identifies a file from well-known byte signatures at the start of its
-- content.
--
-- `content` is the beginning of the file as a byte string and `ext` is the
-- lower-cased file extension (may be nil); the extension is only used to tell
-- apart formats that share a container (OLE2 Office files and zip based
-- documents). Returns the mimetype string, or nil when nothing matched.
--
-- Notes on signatures:
--   * Literals that contain significant spaces next to each other are written
--     with \x20 escapes so that whitespace normalisation cannot shorten them
--     (JPEG 2000 "jP" + two spaces, QuickTime brand "qt" + two spaces).
--   * The Ogg FLAC (0x7F "FLAC"), OGM video (0x01 "vide") and GNU tar
--     ("ustar" + two spaces + NUL) signatures are accepted in addition to the
--     values this function matched historically.
--   * A DLL is recognised through IMAGE_FILE_DLL (0x2000) in the COFF
--     Characteristics field. That field is little endian, so the flag is bit
--     0x20 of the second byte.
function binary_tests(content, ext)
    local length = #content
    local _1_8 = content:sub(1, 8)
    local _1_7 = content:sub(1, 7)
    local _1_6 = content:sub(1, 6)
    local _1_5 = content:sub(1, 5)
    local _1_4 = content:sub(1, 4)
    local _1_3 = content:sub(1, 3)
    local _1_2 = content:sub(1, 2)
    local _9_12 = content:sub(9, 12)

    -- Images
    if _1_4 == '\xC5\xD0\xD3\xC6' then
        -- With a Windows-format EPS, the file starts right after a 30-byte
        -- header, or a 30-byte header followed by two bytes of padding
        if content:sub(33, 42) == '%!PS-Adobe' or content:sub(31, 40) == '%!PS-Adobe' then
            return 'application/postscript'
        end
    end

    if _1_8 == '%!PS-Ado' and content:sub(9, 10) == 'be' then
        return 'application/postscript'
    end

    if _1_4 == 'MM\x00*' or _1_4 == 'II*\x00' then
        return 'image/tiff'
    end

    if _1_8 == '\x89PNG\r\n\x1A\n' then
        return 'image/png'
    end

    if _1_6 == 'GIF87a' or _1_6 == 'GIF89a' then
        return 'image/gif'
    end

    if _1_4 == 'RIFF' and _9_12 == 'WEBP' then
        return 'image/webp'
    end

    if _1_2 == 'BM' and length > 14 and in_table(content:sub(15, 15), {'\x0C', '(', '@', '\x80'}) then
        return 'image/x-ms-bmp'
    end

    local normal_jpeg = length > 10 and in_table(content:sub(7, 10), {'JFIF', 'Exif'})
    local photoshop_jpeg = length > 24 and _1_4 == '\xFF\xD8\xFF\xED' and content:sub(21, 24) == '8BIM'
    if normal_jpeg or photoshop_jpeg then
        return 'image/jpeg'
    end

    if _1_4 == '8BPS' then
        return 'image/vnd.adobe.photoshop'
    end

    -- JPEG 2000 signature box: 00 00 00 0C 6A 50 20 20 0D 0A 87 0A
    if _1_8 == '\x00\x00\x00\x0CjP\x20\x20' and _9_12 == '\r\n\x87\n' then
        return 'image/jp2'
    end

    if _1_4 == '\x00\x00\x01\x00' then
        return 'application/vnd.microsoft.icon'
    end

    -- Audio/Video
    if _1_4 == '\x1AE\xDF\xA3' and length > 1000 then
        local mimetype, err = ebml_parse(content)
        if mimetype then
            return mimetype
        end
    end

    if _1_4 == 'MOVI' then
        if in_table(content:sub(5, 8), {'moov', 'mdat'}) then
            return 'video/quicktime'
        end
    end

    if length > 8 and content:sub(5, 8) == 'ftyp' then
        local lower_9_12 = _9_12:lower()

        if in_table(lower_9_12, {'avc1', 'isom', 'iso2', 'mp41', 'mp42', 'mmp4', 'ndsc', 'ndsh', 'ndsm', 'ndsp', 'ndss', 'ndxc', 'ndxh', 'ndxm', 'ndxp', 'ndxs', 'f4v ', 'f4p ', 'm4v '}) then
            return 'video/mp4'
        end

        if in_table(lower_9_12, {'msnv', 'ndas', 'f4a ', 'f4b ', 'm4a ', 'm4b ', 'm4p '}) then
            return 'audio/mp4'
        end

        if in_table(lower_9_12, {'3g2a', '3g2b', '3g2c', 'kddi'}) then
            return 'video/3gpp2'
        end

        if in_table(lower_9_12, {'3ge6', '3ge7', '3gg6', '3gp1', '3gp2', '3gp3', '3gp4', '3gp5', '3gp6', '3gs7'}) then
            return 'video/3gpp'
        end

        -- Brands are four bytes long; the QuickTime brand is "qt" padded
        -- with two spaces
        if lower_9_12 == 'mqt ' or lower_9_12 == 'qt\x20\x20' then
            return 'video/quicktime'
        end

        if lower_9_12 == 'jp2 ' then
            return 'image/jp2'
        end
    end

    -- MP3
    if bitwise_and(_1_2, '\xFF\xF6') == '\xFF\xF2' then
        local byte_3 = content:sub(3, 3)
        if bitwise_and(byte_3, '\xF0') ~= '\xF0' and bitwise_and(byte_3, "\x0C") ~= "\x0C" then
            return 'audio/mpeg'
        end
    end

    if _1_3 == 'ID3' then
        return 'audio/mpeg'
    end

    if _1_4 == 'fLaC' then
        return 'audio/x-flac'
    end

    if _1_8 == '0&\xB2u\x8Ef\xCF\x11' then
        -- Without writing a full-on ASF parser, we can just scan for the
        -- UTF-16 string "AspectRatio"
        if content:find('\x00A\x00s\x00p\x00e\x00c\x00t\x00R\x00a\x00t\x00i\x00o', 1, true) then
            return 'video/x-ms-wmv'
        end
        return 'audio/x-ms-wma'
    end

    if _1_4 == 'RIFF' and _9_12 == 'AVI ' then
        return 'video/x-msvideo'
    end

    if _1_4 == 'RIFF' and _9_12 == 'WAVE' then
        return 'audio/x-wav'
    end

    if _1_4 == 'FORM' and _9_12 == 'AIFF' then
        return 'audio/x-aiff'
    end

    if _1_4 == 'OggS' then
        local _29_33 = content:sub(29, 33)

        if _29_33 == '\x01vorb' then
            return 'audio/vorbis'
        end

        -- An Ogg FLAC stream starts its first packet with 0x7F "FLAC"; the
        -- 0x07 form is kept because it was matched historically
        if _29_33 == '\x7FFLAC' or _29_33 == '\x07FLAC' then
            return 'audio/x-flac'
        end

        if _29_33 == 'OpusH' then
            return 'audio/ogg'
        end

        -- Theora and OGM. An OGM video header is 0x01 followed by "video";
        -- the bare 'vide' form is kept because it was matched historically
        if _29_33 == '\x80theo' or _29_33 == '\x01vide' or _29_33 == 'vide' then
            return 'video/ogg'
        end
    end

    if _1_3 == 'FWS' or _1_3 == 'CWS' then
        return 'application/x-shockwave-flash'
    end

    if _1_3 == 'FLV' then
        return 'video/x-flv'
    end

    if _1_5 == '%PDF-' then
        return 'application/pdf'
    end

    if _1_5 == '{\\rtf' then
        return 'text/rtf'
    end

    -- Office '97-2003 formats
    if _1_8 == '\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1' then
        if in_table(ext, {'xls', 'csv', 'tab'}) then
            return 'application/vnd.ms-excel'
        end
        if ext == 'ppt' then
            return 'application/vnd.ms-powerpoint'
        end
        -- We default to word since we need something if the extension isn't recognized
        return 'application/msword'
    end

    if _1_8 == '\x09\x04\x06\x00\x00\x00\x10\x00' then
        return 'application/vnd.ms-excel'
    end

    if _1_6 == '\xDB\xA5\x2D\x00\x00\x00' or _1_5 == '\x50\x4F\x5E\x51\x60' or _1_4 == '\xFE\x37\x00\x23' or _1_3 == '\x94\xA6\x2E' then
        return 'application/msword'
    end

    if _1_4 == 'PK\x03\x04' then
        -- Office XML formats
        if ext == 'xlsx' then
            return 'application/vnd.ms-excel'
        end

        if ext == 'pptx' then
            return 'application/vnd.ms-powerpoint'
        end

        if ext == 'docx' then
            return 'application/msword'
        end

        -- Open Office formats
        if ext == 'ods' then
            return 'application/vnd.oasis.opendocument.spreadsheet'
        end

        if ext == 'odp' then
            return 'application/vnd.oasis.opendocument.presentation'
        end

        if ext == 'odt' then
            return 'application/vnd.oasis.opendocument.text'
        end

        -- iWork - some programs like Mac Mail change the filename to
        -- .numbers.zip, etc
        if ext == 'pages' or ext == 'pages.zip' then
            return 'application/vnd.apple.pages'
        end

        if ext == 'key' or ext == 'key.zip' then
            return 'application/vnd.apple.keynote'
        end

        if ext == 'numbers' or ext == 'numbers.zip' then
            return 'application/vnd.apple.numbers'
        end

        -- Otherwise just a zip
        return 'application/zip'
    end

    -- Archives
    if length > 257 then
        if content:sub(258, 263) == 'ustar\x00' then
            return 'application/x-tar'
        end

        -- GNU tar writes "ustar" followed by two spaces (0x20) and a NUL;
        -- the 0x40 form is kept because it was matched historically
        local gnu_magic = content:sub(258, 265)
        if gnu_magic == 'ustar\x20\x20\x00' or gnu_magic == 'ustar\x40\x40\x00' then
            return 'application/x-tar'
        end
    end

    if _1_7 == 'Rar!\x1A\x07\x00' or _1_8 == 'Rar!\x1A\x07\x01\x00' then
        return 'application/x-rar-compressed'
    end

    if _1_2 == '\x1F\x9D' then
        return 'application/x-compress'
    end

    if _1_2 == '\x1F\x8B' then
        return 'application/x-gzip'
    end

    if _1_3 == 'BZh' then
        return 'application/x-bzip2'
    end

    if _1_6 == '\xFD7zXZ\x00' then
        return 'application/x-xz'
    end

    if _1_6 == '7z\xBC\xAF\x27\x1C' then
        return 'application/x-7z-compressed'
    end

    -- The offset of the PE header is stored in bytes 61-64 of the DOS header,
    -- so anything shorter than that cannot be inspected (a text file may
    -- simply begin with the letters "MZ")
    if _1_2 == 'MZ' and length >= 64 then
        local pe_header_start = unpack_le(content:sub(61, 64))
        if pe_header_start then
            local signature = content:sub(pe_header_start + 1, pe_header_start + 4)
            if signature == 'PE\x00\x00' then
                local image_file_header_start = pe_header_start + 5
                local characteristics = content:sub(image_file_header_start + 18, image_file_header_start + 19)
                -- IMAGE_FILE_DLL is 0x2000; stored little endian that is
                -- the byte pair 00 20
                local is_dll = bitwise_and(characteristics, '\x00\x20') == '\x00\x20'
                if is_dll then
                    return 'application/x-msdownload'
                end
                return 'application/octet-stream'
            end
        end
    end

    return nil
end
-- Identifies text content from recognisable markers: a PostScript header,
-- PHP open tags, an XML declaration (refined to SVG, XHTML or RSS), an HTML
-- start, or a shebang line naming a known interpreter.
-- Returns the mimetype string, or nil when nothing matched.
function text_tests(content)
    local lowered = content:lower()

    if content:find('^%%!PS-Adobe') then
        return 'application/postscript'
    end

    -- PHP open tags may appear anywhere in the content
    local has_php_tag = lowered:find('<?php', 1, true) or content:find('<?=', 1, true)
    if has_php_tag then
        return 'application/x-httpd-php'
    end

    if lowered:find('^%s*<%?xml') then
        local xml_type = 'application/xml'
        if content:find('<svg') then
            xml_type = 'image/svg+xml'
        elseif lowered:find('<!doctype html') then
            xml_type = 'application/xhtml+xml'
        elseif content:find('<rss') then
            xml_type = 'application/rss+xml'
        end
        return xml_type
    end

    if lowered:find('^%s*<html') or lowered:find('^%s*<!doctype') then
        return 'text/html'
    end

    -- Shebang lines, tried in this order
    local shebangs = {
        {'^#![/a-z0-9]+ ?python', 'application/x-python'},
        {'^#![/a-z0-9]+ ?perl', 'application/x-perl'},
        {'^#![/a-z0-9]+ ?ruby', 'application/x-ruby'},
        {'^#![/a-z0-9]+ ?php', 'application/x-httpd-php'},
        {'^#![/a-z0-9]+ ?bash', 'text/x-shellscript'},
    }
    for _, entry in ipairs(shebangs) do
        if lowered:find(entry[1]) then
            return entry[2]
        end
    end

    return nil
end
-- Lookup table from file extension to mimetype, used for text files whose
-- content gave no hint. It is filled from a list of extensions per mimetype.
local ext_map = {}
do
    local extensions_by_mimetype = {
        ['text/css'] = {'css'},
        ['text/csv'] = {'csv'},
        ['text/html'] = {'htm', 'html', 'xhtml'},
        ['text/calendar'] = {'ics'},
        ['application/javascript'] = {'js'},
        ['application/x-httpd-php'] = {'php', 'php3', 'php4', 'php5', 'inc'},
        ['application/x-perl'] = {'pl', 'cgi'},
        ['application/x-python'] = {'py'},
        ['application/x-ruby'] = {'rb', 'rhtml'},
        ['application/rss+xml'] = {'rss'},
        ['text/x-shellscript'] = {'sh'},
        ['text/tab-separated-values'] = {'tab'},
        ['text/x-vcard'] = {'vcf'},
        ['application/xml'] = {'xml'},
    }
    for mimetype, extensions in pairs(extensions_by_mimetype) do
        for _, known_ext in ipairs(extensions) do
            ext_map[known_ext] = mimetype
        end
    end
end

-- Returns the mimetype registered for the extension `ext`, or 'text/plain'
-- when the extension is nil or not in the table.
function ext_tests(ext)
    return ext_map[ext] or 'text/plain'
end
local _M = {}

-- Detects the mimetype of the file at `path`.
--
-- Only the first 4096 bytes of the file are read. `filename` is optional and
-- is the name whose extension is used as a hint; it defaults to the last
-- component of `path`.
--
-- Returns the mimetype string, or nil plus the error message from io.open
-- when the file cannot be opened.
function _M.via_path(path, filename)
    -- Binary mode: the signatures are raw bytes, and text mode may translate
    -- line endings or stop at a Ctrl-Z byte on some platforms
    local f, err = io.open(path, 'rb')
    if not f then
        return nil, err
    end

    -- read() returns nil when nothing can be read (for example an empty
    -- file); treat that as empty content
    local content = f:read(4096) or ''
    f:close()

    if not filename then
        filename = basename(path)
    end

    return _M.via_content(content, filename)
end
-- Detects the mimetype of `content`, the beginning of a file as a byte
-- string, using `filename` only for its extension.
--
-- Detection order: binary signatures first; then, if the content contains
-- control bytes that do not belong in text, 'application/octet-stream';
-- then text markers; and finally the file extension.
--
-- A nil `content` is treated as empty and a nil `filename` as a name without
-- an extension. Always returns a mimetype string.
function _M.via_content(content, filename)
    content = content or ''

    local ext = nil
    if filename then
        ext = extension(filename)
    end

    local mimetype = binary_tests(content, ext)
    if mimetype then
        return mimetype
    end

    -- Binary-looking files should have been detected so far
    if content:find('[%z\x01-\x08\x0B\x0C\x0E-\x1F]') then
        return 'application/octet-stream'
    end

    mimetype = text_tests(content)
    if mimetype then
        return mimetype
    end

    -- No low ASCII chars and no easily distinguishable tokens, so the file
    -- extension decides
    return ext_tests(ext)
end

return _M